Closed ruhengChen closed 4 months ago
是的,根据提供的日志来看,alice 节点的报错原因是 OOMKilled,即内存不足导致容器被终止。kuscia 最少需要 6G 内存,如果使用 docker 环境运行,可以使用 docker update --memory 调整容器的内存限制。
你好,目前kuscia的docker内存我们都是调整过的
您这边跑psi任务的数据量有多大
我刚开始用了百万的数据 也怀疑是数据量太大 但是后面我试了几十条 也不行
free -m 显示您的系统整体内存15G,我看还起了两个autonomy-secretpad容器,已经占系统内存的50%以上了,所以还是系统整体内存不足导致,您这边的场景是什么?如果只是单机体验kuscia psi的话,可以只保留两个autonomy节点再试下。
我把两个 secretpad 都停了 ,然后跑了示例任务 还是失败了 docker exec -it ${USER}-kuscia-autonomy-alice scripts/user/create_example_job.sh
[root@root-kuscia-autonomy-alice-ecs-46f7 kuscia]# kubectl get kj -n cross-domain NAME STARTTIME COMPLETIONTIME LASTRECONCILETIME PHASE wilf 21h 20h 20h Failed xcwj 20h 20h 20h Failed pyqd 20h 20h 20h Failed eltr 18h 18h 18h Failed xzkt 16h 16h 16h Failed owke 16h 16h 16h Failed secretflow-task-20240717103430 66s 26s 26s Failed
[root@root-kuscia-autonomy-alice-ecs-46f7 kuscia]# kubectl get pods secretflow-task-20240717103430-single-psi-0 -o yaml -n alice
apiVersion: v1
kind: Pod
metadata:
annotations:
kuscia.secretflow/config-template-volumes: config-template
kuscia.secretflow/initiator: alice
kuscia.secretflow/task-id: secretflow-task-20240717103430-single-psi
kuscia.secretflow/task-resource: secretflow-task-20240717103430-single-psi-f2238d0630f8
kuscia.secretflow/task-resource-group: secretflow-task-20240717103430-single-psi
creationTimestamp: "2024-07-17T02:34:31Z"
labels:
kuscia.secretflow/communication-role-client: "true"
kuscia.secretflow/communication-role-server: "true"
kuscia.secretflow/controller: kusciatask
kuscia.secretflow/pod-identity: fa6189fc-6c7a-4826-8c41-797cf89c9417-0
kuscia.secretflow/pod-role: ""
kuscia.secretflow/task-resource-uid: 10a8fade-9da0-4d32-b1bf-fcab22828c3d
kuscia.secretflow/task-uid: fa6189fc-6c7a-4826-8c41-797cf89c9417
name: secretflow-task-20240717103430-single-psi-0
namespace: alice
resourceVersion: "220100"
uid: ee29d7e2-55ec-45e5-882b-241b5d8a1913
spec:
automountServiceAccountToken: false
containers:
- args:
- -c
- python -m secretflow.kuscia.entry ./kuscia/task-config.conf
command:
- sh
env:
- name: KUSCIA_DOMAIN_ID
value: alice
- name: TASK_ID
value: secretflow-task-20240717103430-single-psi
- name: TASK_CLUSTER_DEFINE
value: '{"parties":[{"name":"alice", "role":"", "services":[{"portName":"spu",
"endpoints":["secretflow-task-20240717103430-single-psi-0-spu.alice.svc"]},
{"portName":"fed", "endpoints":["secretflow-task-20240717103430-single-psi-0-fed.alice.svc"]},
{"portName":"global", "endpoints":["secretflow-task-20240717103430-single-psi-0-global.alice.svc:27493"]}]},
{"name":"bob", "role":"", "services":[{"portName":"spu", "endpoints":["secretflow-task-20240717103430-single-psi-0-spu.bob.svc"]},
{"portName":"fed", "endpoints":["secretflow-task-20240717103430-single-psi-0-fed.bob.svc"]},
{"portName":"global", "endpoints":["secretflow-task-20240717103430-single-psi-0-global.bob.svc:20002"]}]}],
"selfPartyIdx":0, "selfEndpointIdx":0}'
- name: ALLOCATED_PORTS
value: '{"ports":[{"name":"client-server", "port":27490, "scope":"Local", "protocol":"GRPC"},
{"name":"spu", "port":27491, "scope":"Cluster", "protocol":"GRPC"}, {"name":"fed",
"port":27492, "scope":"Cluster", "protocol":"GRPC"}, {"name":"global", "port":27493,
"scope":"Domain", "protocol":"GRPC"}, {"name":"node-manager", "port":27494,
"scope":"Local", "protocol":"GRPC"}, {"name":"object-manager", "port":27495,
"scope":"Local", "protocol":"GRPC"}]}'
- name: TASK_INPUT_CONFIG
value: '{"sf_datasource_config":{"alice":{"id":"default-data-source"},"bob":{"id":"default-data-source"}},"sf_cluster_desc":{"parties":["alice","bob"],"devices":[{"name":"spu","type":"spu","parties":["alice","bob"],"config":"{\"runtime_config\":{\"protocol\":\"REF2K\",\"field\":\"FM64\"},\"link_desc\":{\"connect_retry_times\":60,\"connect_retry_interval_ms\":1000,\"brpc_channel_protocol\":\"http\",\"brpc_channel_connection_type\":\"pooled\",\"recv_timeout_ms\":1200000,\"http_timeout_ms\":1200000}}"},{"name":"heu","type":"heu","parties":["alice","bob"],"config":"{\"mode\":
\"PHEU\", \"schema\": \"paillier\", \"key_size\": 2048}"}],"ray_fed_config":{"cross_silo_comm_backend":"brpc_link"}},"sf_node_eval_param":{"domain":"data_prep","name":"psi","version":"0.0.5","attr_paths":["protocol","sort_result","allow_duplicate_keys","allow_duplicate_keys/yes/join_type","allow_duplicate_keys/yes/join_type/left_join/left_side","input/receiver_input/key","input/sender_input/key"],"attrs":[{"s":"PROTOCOL_ECDH"},{"b":true},{"s":"yes"},{"s":"left_join"},{"ss":["alice"]},{"ss":["id1"]},{"ss":["id2"]}]},"sf_input_ids":["alice-table","bob-table"],"sf_output_ids":["psi-output"],"sf_output_uris":["psi-output.csv"]}'
- name: KUSCIA_PORT_CLIENT_SERVER_NUMBER
value: "27490"
- name: KUSCIA_PORT_SPU_NUMBER
value: "27491"
- name: KUSCIA_PORT_FED_NUMBER
value: "27492"
- name: KUSCIA_PORT_GLOBAL_NUMBER
value: "27493"
- name: KUSCIA_PORT_NODE_MANAGER_NUMBER
value: "27494"
- name: KUSCIA_PORT_OBJECT_MANAGER_NUMBER
value: "27495"
image: secretflow-registry.cn-hangzhou.cr.aliyuncs.com/secretflow/secretflow-lite-anolis8:1.7.0b0
imagePullPolicy: IfNotPresent
name: secretflow
ports:
- containerPort: 27491
name: spu
protocol: TCP
- containerPort: 27492
name: fed
protocol: TCP
- containerPort: 27493
name: global
protocol: TCP
- containerPort: 27494
name: node-manager
protocol: TCP
- containerPort: 27495
name: object-manager
protocol: TCP
- containerPort: 27490
name: client-server
protocol: TCP
resources: {}
terminationMessagePath: /dev/termination-log
terminationMessagePolicy: FallbackToLogsOnError
volumeMounts:
- mountPath: /root/kuscia/task-config.conf
name: config-template
subPath: task-config.conf
workingDir: /root
dnsPolicy: ClusterFirst
enableServiceLinks: true
nodeName: root-kuscia-autonomy-alice-ecs-46f7
nodeSelector:
kuscia.secretflow/namespace: alice
preemptionPolicy: PreemptLowerPriority
priority: 0
restartPolicy: Never
schedulerName: kuscia-scheduler
securityContext: {}
serviceAccount: default
serviceAccountName: default
terminationGracePeriodSeconds: 30
tolerations:
- effect: NoSchedule
key: kuscia.secretflow/agent
operator: Exists
- effect: NoExecute
key: node.kubernetes.io/not-ready
operator: Exists
tolerationSeconds: 300
- effect: NoExecute
key: node.kubernetes.io/unreachable
operator: Exists
tolerationSeconds: 300
volumes:
- configMap:
defaultMode: 420
name: secretflow-task-20240717103430-single-psi-configtemplate
name: config-template
status:
conditions:
- lastProbeTime: null
lastTransitionTime: "2024-07-17T02:34:32Z"
status: "True"
type: Initialized
- lastProbeTime: null
lastTransitionTime: "2024-07-17T02:34:56Z"
reason: PodFailed
status: "False"
type: Ready
- lastProbeTime: null
lastTransitionTime: "2024-07-17T02:34:56Z"
reason: PodFailed
status: "False"
type: ContainersReady
- lastProbeTime: null
lastTransitionTime: "2024-07-17T02:34:32Z"
status: "True"
type: PodScheduled
containerStatuses:
- containerID: containerd://098d23cc708c34c559264cad6def2d069277b86bc34c91737dd112dc4b6b81ec
image: secretflow-registry.cn-hangzhou.cr.aliyuncs.com/secretflow/secretflow-lite-anolis8:1.7.0b0
imageID: sha256:96f7618d2c8e4c923e41451baa72cadbb9bfd1f365f4695e0beb31589b566d19
lastState: {}
name: secretflow
ready: false
restartCount: 0
started: false
state:
terminated:
containerID: containerd://098d23cc708c34c559264cad6def2d069277b86bc34c91737dd112dc4b6b81ec
exitCode: 137
finishedAt: "2024-07-17T02:34:55Z"
message: |
WARNING:root:Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.
2024-07-17 02:34:44,535|alice|INFO|secretflow|entry.py:start_ray:59| ray_conf: RayConfig(ray_node_ip_address='secretflow-task-20240717103430-single-psi-0-global.alice.svc', ray_node_manager_port=27494, ray_object_manager_port=27495, ray_client_server_port=27490, ray_worker_ports=[], ray_gcs_port=27493)
2024-07-17 02:34:44,535|alice|INFO|secretflow|entry.py:start_ray:67| Trying to start ray head node at secretflow-task-20240717103430-single-psi-0-global.alice.svc, start command: ray start --head --include-dashboard=false --disable-usage-stats --num-cpus=32 --node-ip-address=secretflow-task-20240717103430-single-psi-0-global.alice.svc --port=27493 --node-manager-port=27494 --object-manager-port=27495 --ray-client-server-port=27490
reason: OOMKilled
startedAt: "2024-07-17T02:34:33Z"
hostIP: 172.18.0.6
phase: Failed
startTime: "2024-07-17T02:34:32Z"
[root@ecs-46f7 ~]# docker ps
CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
d33a6d3d1637 secretflow-registry.cn-hangzhou.cr.aliyuncs.com/secretflow/kuscia:0.9.0b0 "tini -- bin/kuscia …" 40 hours ago Up 17 hours 0.0.0.0:13082->80/tcp, :::13082->80/tcp, 0.0.0.0:10081->1080/tcp, :::10081->1080/tcp, 0.0.0.0:40805->8082/tcp, :::40805->8082/tcp, 0.0.0.0:40804->8083/tcp, :::40804->8083/tcp root-kuscia-autonomy-bob
27965ded9beb secretflow-registry.cn-hangzhou.cr.aliyuncs.com/secretflow/kuscia:0.9.0b0 "tini -- bin/kuscia …" 41 hours ago Up 17 hours 0.0.0.0:13081->80/tcp, :::13081->80/tcp, 0.0.0.0:10080->1080/tcp, :::10080->1080/tcp, 0.0.0.0:40802->8082/tcp, :::40802->8082/tcp, 0.0.0.0:40803->8083/tcp, :::40803->8083/tcp root-kuscia-autonomy-alice
[root@ecs-46f7 ~]# free -m
total used free shared buff/cache available
Mem: 15760 1779 12094 16 1886 11802
Swap: 0 0 0
我这边找台机器试下。你那边如果有资源的话,可以先在两台机器上分别部署一个节点来进行测试;一般在单机上部署多节点的话,我们推荐系统内存大于16G。
CONTAINER ID NAME CPU % MEM USAGE / LIMIT MEM % NET I/O BLOCK I/O PIDS
e7d0f30ad94b root-kuscia-autonomy-bob 3.73% 943.6MiB / 6GiB 15.36% 1.99MB / 1.92MB 217MB / 1.39GB 107
12b6b609da9d root-kuscia-autonomy-alice 2.87% 975.2MiB / 6GiB 15.87% 1.92MB / 1.99MB 34.7MB / 1.92GB 116
^C
[root@iZbp143l1lire20uffx9t5Z data]# df -h
Filesystem Size Used Avail Use% Mounted on
devtmpfs 7.8G 0 7.8G 0% /dev
tmpfs 7.8G 0 7.8G 0% /dev/shm
tmpfs 7.8G 13M 7.8G 1% /run
tmpfs 7.8G 0 7.8G 0% /sys/fs/cgroup
/dev/nvme0n1p2 40G 28G 9.5G 75% /
/dev/nvme0n1p1 191M 9.8M 182M 6% /boot/efi
tmpfs 1.6G 0 1.6G 0% /run/user/0
overlay 40G 28G 9.5G 75% /var/lib/docker/overlay2/7d786a32340c885cdbedcfe427cb205bdcbb8c265615b2cc8deb7b29f125022d/merged
overlay 40G 28G 9.5G 75% /var/lib/docker/overlay2/cdea1e7cca9eaafbaf643a03f956c5a654eecab3dba6caba5862fc2ae99f2c07/merged
[root@iZbp143l1lire20uffx9t5Z data]# free -m
total used free shared buff/cache available
Mem: 15906 1923 11298 12 2684 11632
Swap: 0 0 0
[root@iZbp143l1lire20uffx9t5Z data]# docker exec -it ${USER}-kuscia-autonomy-alice kubectl get kj -n cross-domain
NAME STARTTIME COMPLETIONTIME LASTRECONCILETIME PHASE
secretflow-task-20240717131307 11m 11m 11m Succeeded
secretflow-task-20240717132151 3m 2m30s 2m30s Succeeded
您好,这边在当前环境中测试是正常的,建议您排查系统中是否有其他进程正在运行导致内存占用的情况,也可以按照官网教程再尝试下呢。
有没有可能是因为我们是arm系统的原因呢?
目前是支持arm的(上面我发的测试结果也是在arm环境下执行的:Linux iZbp143l1lire20uffx9t5Z 4.18.0-348.20.1.el7.aarch64 #1 SMP Wed Apr 13 20:57:50 UTC 2022 aarch64 aarch64 aarch64 GNU/Linux)。建议升级下docker版本再试下,我这边用的是 26.1.4。
我这边重启了一下 docker, 可以了,非常感谢~
Issue Type
Install/Deploy
Search for existing issues similar to yours
Yes
OS Platform and Distribution
Linux ecs-46f7 4.19.90-17.5.ky10.aarch64 #1 SMP Fri Aug 7 13:35:33 CST 2020 aarch64 aarch64 aarch64 GNU/Linux
Kuscia Version
0.9.0b0
Deployment
docker
deployment Version
24.0.8
App Running type
secretflow
App Running version
1.7.0b0
Configuration file used to run kuscia.
What happened and what you expected to happen.