Closed happyi closed 1 year ago
Please check your CPU info (/proc/cpuinfo).
By default, starrocks_be requires the CPU to support avx2.
.
same problem.
[root@localhost starrocks]# kubectl logs starrockscluster-be-0
[Wed May 31 00:56:20 UTC 2023] Empty $CONFIGMAP_MOUNT_PATH env var, skip it!
[Wed May 31 00:56:20 UTC 2023] Add myself (starrockscluster-be-0.starrockscluster-be-search.starrocks.svc.cluster.local:9050) into FE ...
ERROR 1064 (HY000) at line 1: Unexpected exception: Same backend already exists[starrockscluster-be-0.starrockscluster-be-search.starrocks.svc.cluster.local:9050]
[Wed May 31 00:56:20 UTC 2023] run start_be.sh
/opt/starrocks/be/bin/start_backend.sh: line 185: 808 Illegal instruction (core dumped) ${START_BE_CMD} "$@" >> ${LOG_FILE} 2>&1 < /dev/null
and the k8s nodes already have avx2 enabled.
[root@localhost starrocks]# kubectl get pod -o wide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
starrocks-controller-79b5f48bf6-bkx48 1/1 Running 2 20h 10.42.5.102 rancher2-worker7 <none> <none>
starrockscluster-be-0 0/1 CrashLoopBackOff 16 61m 10.42.24.42 rancher2-worker15 <none> <none>
starrockscluster-be-1 0/1 CrashLoopBackOff 211 17h 10.42.18.89 rancher2-worker10 <none> <none>
starrockscluster-be-2 0/1 CrashLoopBackOff 210 17h 10.42.19.53 rancher2-worker11 <none> <none>
starrockscluster-fe-0 1/1 Running 0 17h 10.42.20.54 rancher2-worker12 <none> <none>
starrockscluster-fe-1 1/1 Running 0 17h 10.42.23.37 rancher2-worker14 <none> <none>
starrockscluster-fe-2 1/1 Running 0 17h 10.42.21.49 rancher2-worker13 <none> <none>
on rancher2-worker15:
[root@rancher2-worker15 ~]# hostname
rancher2-worker15
[root@rancher2-worker15 ~]# lscpu | grep Flags|grep avx2
Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx lm constant_tsc nopl xtopology cpuid tsc_known_freq pni ssse3 cx16 sse4_1 sse4_2 x2apic avx hypervisor lahf_lm cpuid_fault pti avx2 bmi2
on rancher2-worker10:
[root@rancher2-worker10 ~]# hostname
rancher2-worker10
[root@rancher2-worker10 ~]# lscpu | grep Flags|grep avx2
Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx lm constant_tsc nopl xtopology cpuid tsc_known_freq pni ssse3 cx16 sse4_1 sse4_2 x2apic avx hypervisor lahf_lm cpuid_fault pti avx2 bmi2
on rancher2-worker11:
[root@rancher2-worker11 ~]# hostname
rancher2-worker11
[root@rancher2-worker11 ~]# lscpu | grep Flags|grep avx2
Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx lm constant_tsc nopl xtopology cpuid tsc_known_freq pni ssse3 cx16 sse4_1 sse4_2 x2apic avx hypervisor lahf_lm cpuid_fault pti avx2 bmi2
starrocks-fe-and-be.yaml:
apiVersion: starrocks.com/v1alpha1
kind: StarRocksCluster
metadata:
name: starrockscluster
namespace: starrocks
spec:
starRocksFeSpec:
image: starrocks/fe-ubuntu:2.5.4
replicas: 3
requests:
cpu: 1
memory: 3Gi
nodeSelector:
dedicated: starrocks
# affinity:
# nodeAffinity:
# preferredDuringSchedulingIgnoredDuringExecution:
# - weight: 1
# preference:
# matchExpressions:
# - key: dedicated
# operator: In
# values:
# - starrocks
tolerations:
- key: "dedicated"
operator: "Equal"
value: "starrocks"
effect: "NoSchedule"
storageVolumes:
- name: fe-meta
storageClassName: ceph-dba
storageSize: 10Gi
mountPath: /opt/starrocks/fe/meta # overwrite the default meta path
starRocksBeSpec:
image: starrocks/be-ubuntu:2.5.4
replicas: 3
requests:
cpu: 1
memory: 3Gi
nodeSelector:
dedicated: starrocks
tolerations:
- key: "dedicated"
operator: "Equal"
value: "starrocks"
effect: "NoSchedule"
storageVolumes:
- name: be-data
storageClassName: ceph-dba
storageSize: 30Gi
mountPath: /opt/starrocks/be/storage # overwrite the default data path
help
@welyss are you able to apply the following pod spec and then manually start the BE process, to get the detailed error log?
apiVersion: v1
kind: Pod
metadata:
name: starrocks-be-single-pod
spec:
containers:
- args:
- "86400"
command:
- sleep
image: starrocks/be-ubuntu:2.5.4
imagePullPolicy: IfNotPresent
name: be-single-pod
resources:
limits:
cpu: "4"
memory: 8Gi
requests:
cpu: "1"
memory: 3Gi
After the pod is running, run the following command to manually start the BE process:
kubectl exec -it starrocks-be-single-pod -- /opt/starrocks/be/bin/start_be.sh
This command is expected to crash with the same error as you experienced.
And then please send us the following log files /opt/starrocks/be/log/be.out
/opt/starrocks/be/log/be.INFO
@kevincai, thanks for the reply.
[root@localhost starrocks]# cat starrocks-be-single-pod.yaml
apiVersion: v1
kind: Pod
metadata:
name: starrocks-be-single-pod
namespace: starrocks
spec:
containers:
- args:
- "86400"
command:
- sleep
image: starrocks/be-ubuntu:2.5.4
imagePullPolicy: IfNotPresent
name: be-single-pod
resources:
limits:
cpu: "4"
memory: 8Gi
requests:
cpu: "1"
memory: 3Gi
tolerations:
- key: "dedicated"
operator: "Equal"
value: "starrocks"
effect: "NoSchedule"
nodeSelector:
dedicated: starrocks
[root@localhost starrocks]# kubectl get pod starrocks-be-single-pod -o wide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
starrocks-be-single-pod 1/1 Running 0 29m 10.42.24.43 rancher2-worker15 <none> <none>
then run the command:
[root@localhost starrocks]# kubectl exec -it starrocks-be-single-pod -- /opt/starrocks/be/bin/start_be.sh
/opt/starrocks/be/bin/start_backend.sh: line 185: 797 Illegal instruction (core dumped) ${START_BE_CMD} "$@" >> ${LOG_FILE} 2>&1 < /dev/null
command terminated with exit code 132
here is log:
[root@localhost starrocks]# kubectl exec -it starrocks-be-single-pod -- /bin/bash
root@starrocks-be-single-pod:/opt/starrocks# cat /opt/starrocks/be/log/be.out
start time: Wed May 31 06:17:45 UTC 2023
root@starrocks-be-single-pod:/opt/starrocks# cat /opt/starrocks/be/log/be.INFO
cat: /opt/starrocks/be/log/be.INFO: No such file or directory
@welyss can you check if there is any related information in the `dmesg` command output after the Illegal instruction error?
@welyss can you check if there is any related information from
dmesg
command output, after the Illegal instruction error?
root@starrocks-be-single-pod:/opt/starrocks# /opt/starrocks/be/bin/start_be.sh
/opt/starrocks/be/bin/start_backend.sh: line 185: 1608 Illegal instruction (core dumped) ${START_BE_CMD} "$@" >> ${LOG_FILE} 2>&1 < /dev/null
root@starrocks-be-single-pod:/opt/starrocks# dmesg
Thanks, will check the following error in dmesg
[81826.337385] traps: starrocks_be[652] trap invalid opcode ip:2c1101b sp:7ffdc6698860 error:0 in starrocks_be[27ec000+5884000]
[82142.091083] traps: starrocks_be[6919] trap invalid opcode ip:2c1101b sp:7ffe2e24ed50 error:0 in starrocks_be[27ec000+5884000]
[82461.279073] traps: starrocks_be[12117] trap invalid opcode ip:2c1101b sp:7ffdf09ce070 error:0 in starrocks_be[27ec000+5884000]
[82544.108128] traps: starrocks_be[14115] trap invalid opcode ip:2c1101b sp:7ffe2dd7f220 error:0 in starrocks_be[27ec000+5884000]
@welyss can you run `uname -a` in the pod? We need to confirm the kernel version. Thanks!
Also, if possible, please paste the full output of `lscpu` to help us understand why it reports an invalid opcode.
ok, this is on k8s worker node which starrocks-be scheduled.
[root@rancher2-worker15 ~]# hostname
rancher2-worker15
[root@rancher2-worker15 ~]# uname -a
Linux rancher2-worker15 4.20.7-1.el7.elrepo.x86_64 #1 SMP Wed Feb 6 13:17:46 EST 2019 x86_64 x86_64 x86_64 GNU/Linux
[root@rancher2-worker15 ~]# lscpu
Architecture: x86_64
CPU op-mode(s): 32-bit, 64-bit
Byte Order: Little Endian
CPU(s): 4
On-line CPU(s) list: 0-3
Thread(s) per core: 1
Core(s) per socket: 2
Socket(s): 2
NUMA node(s): 1
Vendor ID: GenuineIntel
CPU family: 15
Model: 6
Model name: Common KVM processor
Stepping: 1
CPU MHz: 2095.076
BogoMIPS: 4190.15
Hypervisor vendor: KVM
Virtualization type: full
L1d cache: 32K
L1i cache: 32K
L2 cache: 4096K
L3 cache: 16384K
NUMA node0 CPU(s): 0-3
Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx lm constant_tsc nopl xtopology cpuid tsc_known_freq pni ssse3 cx16 sse4_1 sse4_2 x2apic avx hypervisor lahf_lm cpuid_fault pti avx2 bmi2
in the pod:
[root@localhost starrocks]# kubectl exec -it starrocks-be-single-pod -- /bin/bash
root@starrocks-be-single-pod:/opt/starrocks# uname -a
Linux starrocks-be-single-pod 4.20.7-1.el7.elrepo.x86_64 #1 SMP Wed Feb 6 13:17:46 EST 2019 x86_64 x86_64 x86_64 GNU/Linux
root@starrocks-be-single-pod:/opt/starrocks# lscpu
Architecture: x86_64
CPU op-mode(s): 32-bit, 64-bit
Address sizes: 46 bits physical, 48 bits virtual
Byte Order: Little Endian
CPU(s): 4
On-line CPU(s) list: 0-3
Vendor ID: GenuineIntel
Model name: Common KVM processor
CPU family: 15
Model: 6
Thread(s) per core: 1
Core(s) per socket: 2
Socket(s): 2
Stepping: 1
BogoMIPS: 4190.15
Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx lm constant_tsc nopl xtopology cpuid tsc_known_freq pni ssse3 cx16 sse4_1 sse4_2 x2apic avx hypervisor lahf_lm cpuid_fault pti avx2
bmi2
Virtualization features:
Hypervisor vendor: KVM
Virtualization type: full
Caches (sum of all):
L1d: 128 KiB (4 instances)
L1i: 128 KiB (4 instances)
L2: 16 MiB (4 instances)
L3: 32 MiB (2 instances)
NUMA:
NUMA node(s): 1
NUMA node0 CPU(s): 0-3
Vulnerabilities:
L1tf: Mitigation; PTE Inversion
Meltdown: Mitigation; PTI
Spec store bypass: Vulnerable
Spectre v1: Mitigation; __user pointer sanitization
Spectre v2: Mitigation; Full generic retpoline, STIBP disabled, RSB filling
@welyss appreciated for the information.
Some update on this issue:
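What we know
* the binary crashes on a specific CPU instruction, `vpxor`, which is an AVX2 instruction
What we don't understand
* your CPU supports the avx2 extension, so why does it core dump anyway?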
In the meanwhile, can you try use this image lvlouisaslia/allin1-ubuntu:3.0.0-no-avx2-no-sse42
instead of starrocks/be-ubuntu:2.5.4
in the starrocks-be-single-pod.yaml, and try start the BE process by
[root@localhost starrocks]# kubectl exec -it starrocks-be-single-pod -- /data/deploy/starrocks/be/bin/start_be.sh
check if it still crashes or not.
@welyss appreciated for the information.
Some update on this issue:
What we know
* the binary crashes on a specific CPU instruction, `vpxor`, which is an AVX2 instruction
What we don't understand
* your CPU supports the avx2 extension, so why does it core dump anyway?
In the meantime, can you try using this image
lvlouisaslia/allin1-ubuntu:3.0.0-no-avx2-no-sse42
instead of starrocks/be-ubuntu:2.5.4
in the starrocks-be-single-pod.yaml, and try starting the BE process with
[root@localhost starrocks]# kubectl exec -it starrocks-be-single-pod -- /data/deploy/starrocks/be/bin/start_be.sh
check if it still crashes or not.
ok, thank you for the help, image lvlouisaslia/allin1-ubuntu:3.0.0-no-avx2-no-sse42
will hold on frontend, no crash, guess it works.
Then we changed the KVM CPU model from 'avx' to 'host', and it works. It looks like [lscpu] can't definitively tell whether a CPU instruction is actually available inside the VM; perhaps there is something wrong with our KVM configuration for the 'avx' CPU model.
ok, thankyou for help, image lvlouisaslia/allin1-ubuntu:3.0.0-no-avx2-no-sse42 will hold on frontend, no crash, guess it works.
Yes, it worked then.
then we change cpu model of kvm from 'avx' to 'host', it works. it looks like [lscpu] can't definitely tell the cpu instruction has been loaded in vm system, perhaps there is something wrong with our kvm conf in cpu model 'avx'.
That's good to know, glad that the issue is solved. I will take a look at KVM virtualization related info.
Close this issue for now.
env: Kubernetes v1.26.1 operator: starrocks/operator:latest
deployment: starrocks/fe-ubuntu:2.5.4 starrocks/be-ubuntu:2.5.4
FE works well, but BE can't be started.
[Mon Apr 17 08:00:50 UTC 2023] Empty $CONFIGMAP_MOUNT_PATH env var, skip it! [Mon Apr 17 08:00:50 UTC 2023] Add myself (starrockscluster-be-0.starrockscluster-be-search.open.svc.k8s.com:9050) into FE ... ERROR 1064 (HY000) at line 1: Unexpected exception: Same backend already exists[starrockscluster-be-0.starrockscluster-be-search.open.svc.k8s.com:9050] [Mon Apr 17 08:00:50 UTC 2023] run start_be.sh /opt/starrocks/be/bin/start_backend.sh: line 185: 807 Illegal instruction (core dumped) ${START_BE_CMD} "$@" >> ${LOG_FILE} 2>&1 < /dev/null