foundation-model-stack / fms-hf-tuning

🚀 Collection of tuning recipes with HuggingFace SFTTrainer and PyTorch FSDP.
Apache License 2.0

build(deps): Upgrade accelerate requirement to allow version 1.0.0 #371

Closed · willmj closed this 1 month ago

willmj commented 1 month ago

Description of the change

Updates the requirements on accelerate to permit the latest version (see the Accelerate 1.0.0 release notes).

The --fsdp_backward_prefetch_policy argument is removed in accelerate 1.0.0 and replaced by --fsdp_backward_prefetch; the unit tests were updated to use the new name.
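For illustration, the rename only affects the key name, not the accepted values. A minimal sketch of the relevant part of an accelerate FSDP config (illustrative, not taken from this PR's diff):

```yaml
# Minimal accelerate FSDP config sketch (illustrative only).
distributed_type: FSDP
fsdp_config:
  # accelerate < 1.0.0 used:  fsdp_backward_prefetch_policy: BACKWARD_PRE
  fsdp_backward_prefetch: BACKWARD_PRE   # key name in accelerate >= 1.0.0
```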

Related issue number

closes #372

How to verify the PR

Was the PR tested

github-actions[bot] commented 1 month ago

Thanks for making a pull request! 😃 One of the maintainers will review and advise on the next steps.

willmj commented 1 month ago

Multi-GPU tuning and inference both work. The configs used for verification are below; a sketch of the FSDP launch settings such a run exercises follows them.

Configs:

Tuning:

```yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: sft-trainer-config-allam
data:
  config.json: |
    {
      "model_name_or_path": "/fmaas-model-pvc/ibm_models/allam-beta-13b-chat",
      "training_data_path": "/testing/tuning/input/twitter-complaints.json",
      "output_dir": "/testing/tuning/output/allam-13b-chat/lora/twitter-test-1234",
      "num_train_epochs": 1.0,
      "per_device_train_batch_size": 2,
      "gradient_accumulation_steps": 1,
      "learning_rate": 1e-5,
      "response_template": "\n### Label:",
      "dataset_text_field": "output",
      "peft_method": "lora",
      "r": 8,
      "lora_dropout": 0.05,
      "lora_alpha": 16,
      "embedding_size_multiple_of": 1,
      "lora_post_process_for_vllm": true
    }
---
apiVersion: v1
kind: Pod
metadata:
  name: will-sft-trainer-allam-13b-lora
spec:
  securityContext:
    runAsUser: 1000
    runAsGroup: 0
    fsGroup: 1000
    fsGroupChangePolicy: "OnRootMismatch"
  containers:
    - env:
        - name: SFT_TRAINER_CONFIG_JSON_PATH
          value: /config/config.json
        - name: LOG_LEVEL
          value: DEBUG
      image: docker-na-public.artifactory.swg-devops.com/wcp-ai-foundation-team-docker-virtual/sft-trainer:7f8ace1_ubi9_py311.main
      imagePullPolicy: IfNotPresent
      command: [ "/bin/bash", "-c", "--" ]
      args: [ "while true; do sleep 30; done;" ]
      name: train-conductor-training
      resources:
        limits:
          nvidia.com/gpu: "2"
          memory: 200Gi
          cpu: "10"
          ephemeral-storage: 2Ti
        requests:
          nvidia.com/gpu: "2"
          memory: 80Gi
          cpu: "5"
          ephemeral-storage: 1600Gi
      volumeMounts:
        - mountPath: /testing
          name: testing-bucket
        - mountPath: /llama_eval
          name: llama-eval-pvc
          readOnly: true
        - mountPath: /fmaas-model-pvc
          name: fmaas-model-pvc
          readOnly: true
        - mountPath: /granite
          name: granite-pvc
          readOnly: true
        - mountPath: /config
          name: sft-trainer-config
  imagePullSecrets:
    - name: artifactory-docker-anh
  restartPolicy: Never
  terminationGracePeriodSeconds: 30
  volumes:
    - name: testing-bucket
      persistentVolumeClaim:
        claimName: fmaas-integration-tests-pvc
    - name: llama-eval-pvc
      persistentVolumeClaim:
        claimName: llama-eval-pvc
    - name: fmaas-model-pvc
      persistentVolumeClaim:
        claimName: fmaas-model-pvc
    - name: granite-pvc
      persistentVolumeClaim:
        claimName: granite-pvc
    - name: sft-trainer-config
      configMap:
        name: sft-trainer-config-allam
```

Inference:

```yaml
apiVersion: v1
kind: Service
metadata:
  labels:
    app: text-gen-allam-13b
  name: allam-13b-inference-server
spec:
  clusterIP: None
  ports:
    - name: grpc
      port: 8033
      targetPort: grpc
  selector:
    app: text-gen-allam-13b
  type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app: text-gen-allam-13b
    component: fmaas-inference-server
  name: allam-13b-inference-server
spec:
  replicas: 1
  selector:
    matchLabels:
      app: text-gen-allam-13b
      component: fmaas-inference-server
  strategy:
    rollingUpdate:
      maxSurge: 1
  template:
    metadata:
      annotations:
        prometheus.io/path: /metrics/
        prometheus.io/port: "3000"
        prometheus.io/scrape: "true"
      labels:
        app: text-gen-allam-13b
        component: fmaas-inference-server
    spec:
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: nvidia.com/gpu.product
                    operator: In
                    values:
                      - NVIDIA-A100-SXM4-80GB
      containers:
        - env:
            - name: MODEL_NAME
              value: "/fmaas-model-pvc/ibm_models/allam-beta-13b-chat"
            - name: OUTPUT_SPECIAL_TOKENS
              value: "true"
            - name: MAX_NEW_TOKENS
              value: "4096"
            - name: DEPLOYMENT_FRAMEWORK
              value: tgis_native
            - name: FLASH_ATTENTION
              value: "true"
            - name: NUM_GPUS
              value: "1"
            - name: CUDA_VISIBLE_DEVICES
              value: "0"
            - name: PORT
              value: "3000"
            - name: MAX_LOG_LEN
              value: "100"
            - name: ENABLE_LORA
              value: "true"
            - name: ADAPTER_CACHE
              # had to update from shared_model_storage so it was writeable for model from HF
              value: "/testing/tuning/output/allam-13b-chat/lora/twitter-test-1234"
            - name: HF_HUB_CACHE
              value: /tmp
            - name: TRANSFORMERS_CACHE
              value: $(HF_HUB_CACHE)
            # The below values may vary by model; these are taken from granite-13b
            - name: MAX_BATCH_SIZE
              value: "256"
            - name: MAX_CONCURRENT_REQUESTS
              value: "256"
            # Below is added for the granite-3b-code-instruct model
            # - name: VLLM_ATTENTION_BACKEND
            #   value: XFORMERS
            # to download the model from HF, add below
            # - name: HF_HUB_OFFLINE
            #   value: "0"
          image: quay.io/opendatahub/vllm:fast-ibm-0158e7c
          imagePullPolicy: IfNotPresent
          livenessProbe:
            failureThreshold: 3
            httpGet:
              path: /health
              port: http
              scheme: HTTP
            periodSeconds: 100
            successThreshold: 1
            timeoutSeconds: 8
          name: server
          ports:
            - containerPort: 3000
              name: http
              protocol: TCP
            - containerPort: 8033
              name: grpc
              protocol: TCP
          readinessProbe:
            failureThreshold: 3
            httpGet:
              path: /health
              port: http
              scheme: HTTP
            periodSeconds: 30
            successThreshold: 1
            timeoutSeconds: 5
          # resources will vary by model -- taken from granite-13b
          resources:
            limits:
              cpu: "8"
              memory: 48Gi
              nvidia.com/gpu: "1"
            requests:
              cpu: "4"
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - ALL
            privileged: false
            runAsNonRoot: true
            seccompProfile:
              type: RuntimeDefault
          startupProbe:
            failureThreshold: 24
            httpGet:
              path: /health
              port: http
              scheme: HTTP
            periodSeconds: 30
            successThreshold: 1
          terminationMessagePath: /dev/termination-log
          terminationMessagePolicy: FallbackToLogsOnError
          volumeMounts:
            - mountPath: /granite
              name: ibm-granite-pvc
              readOnly: true
            - name: llama-eval-pvc
              mountPath: /llama
              readOnly: true
            - name: llama-3-pvc
              mountPath: /llama3
              readOnly: true
            # - mountPath: /data
            #   name: fms-tuning
            #   readOnly: true
            - mountPath: /testing
              name: fmaas-integration-tests
              readOnly: true
            - mountPath: /fmaas-model-pvc
              name: fmaas-model-pvc
              readOnly: true
      dnsPolicy: ClusterFirst
      enableServiceLinks: false
      # imagePullSecrets:
      #   - name: artifactory-docker
      volumes:
        - name: ibm-granite-pvc
          persistentVolumeClaim:
            claimName: ibm-granite-pvc
        - name: llama-eval-pvc
          persistentVolumeClaim:
            claimName: llama-eval-pvc
        - name: llama-3-pvc
          persistentVolumeClaim:
            claimName: llama-3-pvc
        # - name: fms-tuning
        #   persistentVolumeClaim:
        #     claimName: fms-tuning-pvc
        - name: fmaas-integration-tests
          persistentVolumeClaim:
            claimName: fmaas-integration-tests
        - name: fmaas-model-pvc
          persistentVolumeClaim:
            claimName: fmaas-model-pvc
```