---
# RayCluster custom resource for the KubeRay operator.
# Defines one Ray head pod plus an autoscalable-in-spec worker group.
# TODO enable autoscaling
apiVersion: ray.io/v1
kind: RayCluster
metadata:
  name: kubeflow-raycluster
spec:
  # Must match the Ray version baked into the container images below.
  rayVersion: '2.23.0'
  # Ray head pod configuration
  headGroupSpec:
    # Kubernetes Service Type.
    serviceType: ClusterIP
    # The following params are used to complete the ray start: ray start --head --block --dashboard-host: '0.0.0.0' ...
    rayStartParams:
      dashboard-host: '0.0.0.0'
      block: 'true'
    # pod template
    template:
      metadata:
        # Custom labels. NOTE: To avoid conflicts with the KubeRay operator, do not define custom labels starting with `raycluster`.
        # Refer to https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/
        # The ray head must not have an Istio sidecar.
        # TODO add an AuthorizationPolicy in the future for the ray head
        labels:
          sidecar.istio.io/inject: "false"
      spec:
        containers:
          - name: ray-head
            image: rayproject/ray:2.23.0-py311-cpu
            ports:
              # GCS (Global Control Store) — workers connect here.
              - containerPort: 6379
                name: gcs
              # Ray dashboard UI.
              - containerPort: 8265
                name: dashboard
              # Ray client server.
              - containerPort: 10001
                name: client
            lifecycle:
              preStop:
                exec:
                  # Gracefully stop Ray before the pod is terminated.
                  command: ["/bin/sh", "-c", "ray stop"]
            volumeMounts:
              - mountPath: /tmp/ray
                name: ray-logs
            # The resource requests and limits in this config are too small for production!
            # It is better to use a few large Ray pods than many small ones.
            # For production, it is ideal to size each Ray pod to take up the
            # entire Kubernetes node on which it is scheduled.
            resources:
              limits:
                cpu: "1"
                memory: "2G"
              requests:
                cpu: "100m"
                memory: "2G"
            securityContext:
              allowPrivilegeEscalation: false
              capabilities:
                drop: ["ALL"]
              runAsNonRoot: true
              seccompProfile:
                type: RuntimeDefault
        volumes:
          - name: ray-logs
            emptyDir: {}
  workerGroupSpecs:
    # the pod replicas in this group typed worker
    - replicas: 1
      minReplicas: 1
      maxReplicas: 10
      # logical group name; here called small-group, but it can also be functional
      groupName: small-group
      rayStartParams:
        block: 'true'
      # pod template
      template:
        metadata:
          labels:
            # Disable the sidecars for the ray workers.
            # TODO add an AuthorizationPolicy in the future for the ray worker
            sidecar.istio.io/inject: "false"
        spec:
          containers:
            - name: ray-worker
              image: rayproject/ray:2.23.0-py311-cpu
              lifecycle:
                preStop:
                  exec:
                    # Gracefully stop Ray before the pod is terminated.
                    command: ["/bin/sh", "-c", "ray stop"]
              # use volumeMounts.Optional.
              # Refer to https://kubernetes.io/docs/concepts/storage/volumes/
              volumeMounts:
                - mountPath: /tmp/ray
                  name: ray-logs
              # The resource requests and limits in this config are too small for production!
              # It is better to use a few large Ray pods than many small ones.
              # For production, it is ideal to size each Ray pod to take up the
              # entire Kubernetes node on which it is scheduled.
              resources:
                limits:
                  cpu: "1"
                  memory: "1G"
                requests:
                  cpu: "300m"
                  memory: "1G"
              securityContext:
                allowPrivilegeEscalation: false
                capabilities:
                  drop: ["ALL"]
                runAsNonRoot: true
                seccompProfile:
                  type: RuntimeDefault
          initContainers:
            # the env var $RAY_IP is set by the operator if missing, with the value of the head service name
            - name: init
              image: busybox:1.36
              # Block worker startup until the head service DNS name resolves.
              # Change the cluster postfix if you don't have a default setting.
              command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for K8s Service $RAY_IP; sleep 2; done"]
              securityContext:
                runAsUser: 1000
                allowPrivilegeEscalation: false
                capabilities:
                  drop: ["ALL"]
                runAsNonRoot: true
                seccompProfile:
                  type: RuntimeDefault
          # use volumes
          # Refer to https://kubernetes.io/docs/concepts/storage/volumes/
          volumes:
            - name: ray-logs
              emptyDir: {}
Validation Checklist
Version
master
Describe your issue
Review and improve the Ray example manifests.
Steps to reproduce the issue
The following is better than https://github.com/kubeflow/manifests/blob/master/contrib/ray/raycluster_example.yaml
but we should start with a clean CR from scratch with autoscaling etc.
Screenshots or videos (optional)
No response