---
# yaml-language-server: $schema=config.schema.json
# Name of the cluster
project_name: az-hop
# Azure location name, as returned by the command: az account list-locations -o table
location: eastus
# Name of the resource group in which all resources are created
resource_group: azhop
# Set to true when using an existing resource group. Default is false.
# When using an existing resource group, make sure the location matches that of the existing resource group.
use_existing_rg: false
# If set to true, telemetry is disabled for azhop. See https://azure.github.io/az-hop/deploy/telemetry.html.
#optout_telemetry: true
# A log analytics workspace can be created for monitoring or alerting.
# Alternatively, you can use an existing workspace.
# To use an existing workspace, set create to false and specify the resource group, name and subscription the target workspace lives in.
log_analytics:
  create: true
  # An existing log analytics workspace can be used instead. The resource group, name and subscription id of the workspace will need to be specified.
  # Grant the role "Log Analytics Contributor" on the target Log Analytics Workspace for the identity used to deploy az-hop
  #resource_group:
  #name:
  #subscription_id: # Optional, if not specified the current subscription will be used
# Option to install the monitoring agent on static infra VMs. Can be disabled if the agent is installed by policy.
monitoring:
  install_agent: true
# If set to true, alert rules associated with az-hop are created. Enabling alerting requires an admin email to send alerts to.
alerting:
  enabled: true
  admin_email: maxi.aguirre@nextira.com
  local_volume_threshold: 80
# Additional tags to be added on the Resource Group
tags:
  env: dev
# Define an Azure NetApp Files (ANF) account, single pool and volume.
# If not present, assume that there is an existing NFS share for the users' home directory.
anf:
  create: true
  # Size of the ANF pool and unique volume (min: 4TB, max: 100TB)
  homefs_size_tb: 4
  # Service level of the ANF volume, can be: Standard, Premium, Ultra
  homefs_service_level: Standard
  # dual protocol
  dual_protocol: false # true to enable SMB support. false by default
  # If alerting is enabled, this value will be used to determine when to trigger alerts
  alert_threshold: 80 # alert when ANF volume reaches this threshold
# For small deployments you can use Azure Files instead of ANF for the home directory
azurefiles:
  create: false
  size_gb: 1024
# These mounts will be listed in the Files menu of the OnDemand portal and automatically mounted on all compute nodes and remote desktop nodes
mounts:
  # Mount settings for the user home directory
  home: # This home name can't be changed
    # Type of mount: existing, anf or azurefiles; defaults to existing. One of the three should be defined in order to mount the home directory.
    # When using existing, the mountpoint, server, export and options should be defined; for other cases leave the values as defined with the curly braces.
    type: anf
    mountpoint: /anfhome # /sharedhome for example
    server: '{{anf_home_ip}}' # Specify an existing NFS server name or IP, when using the ANF built in use '{{anf_home_ip}}'
    export: '{{anf_home_path}}' # Specify an existing NFS export directory, when using the ANF built in use '{{anf_home_path}}'
    options: '{{anf_home_opts}}' # Specify the mount options. Default to rw,hard,rsize=262144,wsize=262144,vers=3,tcp,_netdev
  # mount1:
  #   mountpoint: /mount1
  #   server: a.b.c.d # Specify an existing NFS server name or IP
  #   export: myexport1 # Specify an existing NFS export name
  #   options: my_options # Specify the mount options.
# Name of the admin account
admin_user: clusteradmin
# List of identities (object ids) to grant read access to the az-hop key vault (optional)
# key_vault_readers:
# Network
network:
  # Create Network and Application Security Rules, true by default, false when using an existing VNET if not specified
  create_nsg: true
  vnet:
    name: hpcvnet # Optional - default to hpcvnet
    #id: # If a vnet id is set then no network will be created and the provided vnet will be used
    address_space: "10.0.0.0/23"
    # Special VNET Tags
    # tags:
    #   key1: value1
    # When using an existing VNET, only the subnet names will be used and not the address_prefixes
    subnets: # all subnets are optional
      # name values can be used to rename the default to specific names, address_prefixes to change the IP ranges to be used
      # All values below are the default values
      frontend:
        name: frontend
        address_prefixes: "10.0.0.0/29"
        create: true # create the subnet if true. default to true when not specified, default to false if using an existing VNET when not specified
      admin:
        name: admin
        address_prefixes: "10.0.0.16/28"
        create: true
      netapp:
        name: netapp
        address_prefixes: "10.0.0.32/28"
        create: true
      # the outbounddns is optional and only when deploying an Azure Private DNS Resolver
      # outbounddns:
      #   name: outbounddns
      #   address_prefixes: "10.0.0.48/28"
      #   create: true
      ad:
        name: ad
        address_prefixes: "10.0.0.8/29"
        create: true
      # Bastion and Gateway subnets are optional and can be added if a Bastion or a VPN need to be created in the environment
      # bastion: # Bastion subnet name is always fixed to AzureBastionSubnet
      #   address_prefixes: "10.0.0.64/26" # CIDR minimal range must be /26
      #   create: true
      # gateway: # Gateway subnet name is always fixed to GatewaySubnet
      #   address_prefixes: "10.0.0.128/27" # Recommendation is to use /27 or /26 network
      #   create: true
      compute:
        name: compute
        address_prefixes: "10.0.1.0/24"
        create: true
  # Specify the Application Security Groups mapping if already existing
  # asg:
  #   resource_group: # name of the resource group containing the ASG. Default to the resource group containing azhop resources
  #   names: # list of ASG names mapping to the one defined in az-hop
  #     asg-ssh: asg-ssh
  #     asg-rdp: asg-rdp
  #     asg-jumpbox: asg-jumpbox
  #     asg-ad: asg-ad
  #     asg-ad-client: asg-ad-client
  #     asg-lustre: asg-lustre
  #     asg-lustre-client: asg-lustre-client
  #     asg-pbs: asg-pbs
  #     asg-pbs-client: asg-pbs-client
  #     asg-cyclecloud: asg-cyclecloud
  #     asg-cyclecloud-client: asg-cyclecloud-client
  #     asg-nfs-client: asg-nfs-client
  #     asg-telegraf: asg-telegraf
  #     asg-grafana: asg-grafana
  #     asg-robinhood: asg-robinhood
  #     asg-ondemand: asg-ondemand
  #     asg-deployer: asg-deployer
  #     asg-guacamole: asg-guacamole
  #     asg-mariadb-client: asg-mariadb-client
  # peering: # This list is optional, and can be used to create VNet Peerings in the same subscription.
  #   - vnet_name: #"VNET Name to Peer to"
  #     vnet_resource_group: #"Resource Group of the VNET to peer to"
  #     vnet_allow_gateway: false # optional: allow gateway transit (default: true)
# Specify DNS forwarders available in the network
# dns:
#   forwarders:
#     - { name: foo.com, ips: "10.2.0.4, 10.2.0.5" }
# When working in a locked down network, uncomment and fill out this section
locked_down_network:
  enforce: false
  # grant_access_from: [a.b.c.d] # Array of CIDR to grant access from, see https://docs.microsoft.com/en-us/azure/storage/common/storage-network-security?tabs=azure-portal#grant-access-from-an-internet-ip-range
  public_ip: true # Enable public IP creation for Jumpbox, OnDemand and create images. Default to true
# Base image configuration. Can be an image reference, an image_id from the image registry, or a custom managed image
linux_base_image: "OpenLogic:CentOS:7_9-gen2:latest" # publisher:offer:sku:version or image_id
# Linux image plan, if required. Format is publisher:product:name
#linux_base_plan:
windows_base_image: "MicrosoftWindowsServer:WindowsServer:2019-Datacenter-smalldisk:latest" # publisher:offer:sku:version or image_id
lustre_base_image: "azhpc:azurehpc-lustre:azurehpc-lustre-2_12:latest"
# The Lustre plan to use. Only needed when using the default Lustre image from the marketplace. Use "::" for an empty plan
lustre_base_plan: "azhpc:azurehpc-lustre:azurehpc-lustre-2_12" # publisher:product:name
# Jumpbox VM configuration, only needed when deploying through a public IP and without a configured deployer VM
jumpbox:
  vm_size: Standard_B2ms
  # SSH port on which the jumpbox SSH server listens on the public IP. Default to 22
  # Change this to, e.g., 2222, if security policies (like "zero trust") in your tenant automatically block access to port 22 from the internet
  #ssh_port: 2222
# Active Directory VM configuration
ad:
  vm_size: Standard_B2ms
  hybrid_benefit: false # Enable hybrid benefit for AD, default to false
  high_availability: false # Build AD in High Availability mode (2 Domain Controllers) - default to false
# OnDemand VM configuration
ondemand:
  vm_size: Standard_D4s_v5
  #fqdn: azhop.foo.com # When provided it will be used for the certificate server name
  generate_certificate: true # Generate an SSL certificate for the OnDemand portal. Default to true
# Grafana VM configuration
grafana:
  vm_size: Standard_B2ms
# Guacamole VM configuration
guacamole:
  vm_size: Standard_B2ms
# Scheduler VM configuration
scheduler:
  vm_size: Standard_B2ms
# CycleCloud VM configuration
cyclecloud:
  vm_size: Standard_B2ms
  # version: 8.3.0-3062 # to specify a specific version, see https://packages.microsoft.com/yumrepos/cyclecloud/
# Lustre cluster is optional and can be used to create a Lustre cluster in the environment.
lustre:
  create: false # true or false to create a lustre cluster
  rbh_sku: "Standard_D8d_v4"
  mds_sku: "Standard_D8d_v4"
  oss_sku: "Standard_D32d_v4"
  # oss_count: 1
  hsm_max_requests: 8
  mdt_device: "/dev/sdb"
  ost_device: "/dev/sdb"
  # Optional: use existing storage for the archive.
  # If not included, the azhop storage account that is created will be used.
  # hsm:
  #   storage_account: #existing_storage_account_name
  #   storage_container: #only_used_with_existing_storage_account
# List of users to be created on this environment
users:
  # name: username - must be less than 20 characters
  # uid: uniqueid
  # shell: /bin/bash # default to /bin/bash
  # home: /anfhome/<user_name> # default to /homedir_mountpoint/user_name
  # groups: list of groups the user belongs to
  - { name: hpcadmin, uid: 10001, groups: [5001, 5002] }
  - { name: hpcuser, uid: 10002 }
  # - { name: user1, uid: 10003, groups: [6000] }
  # - { name: user2, uid: 10004, groups: [6001] }
usergroups:
  # These group names can be changed, but not the gids, as names are mapped by gid
  - name: azhop-users # All users will be added to this group by default
    gid: 5000
  - name: azhop-admins
    gid: 5001
    description: "For users with azhop admin privileges"
  - name: azhop-localadmins
    gid: 5002
    description: "For users with sudo right or local admin right on nodes"
  # For custom groups use gid >= 6000
  # - name: project1 # For project1 users
  #   gid: 6000
  # - name: project2 # For project2 users
  #   gid: 6001
# Enable cvmfs-eessi - disabled by default
cvmfs_eessi:
  enabled: false
# Scheduler to be installed and configured (openpbs, slurm)
queue_manager: slurm
# Specific SLURM configuration
slurm:
  # Enable SLURM accounting, this will create a SLURM accounting database in a managed MariaDB server instance
  accounting_enabled: false
  # SLURM version to install. Currently supported: only 20.11.9 and 22.05.3.
  # Other versions can be installed by building from source (See build_rpms setting in the slurmserver role)
  slurm_version: 20.11.9
  # Name of the SLURM cluster for accounting (optional, default to 'slurm')
  # WARNING: changing this value on a running cluster will cause slurmctld to fail to start. This is a
  # safety check to prevent accounting errors. To override, remove /var/spool/slurmd/clustername
  cluster_name: slurm_azhop
enroot:
  enroot_version: 3.4.1
# If using an existing Managed MariaDB instance for SLURM accounting and/or Guacamole, specify these values
#database:
#  # Admin user of the database for which the password will be retrieved from the azhop keyvault
#  user: sqladmin
#  # FQDN of the managed instance
#  fqdn:
#  # IP of the managed private endpoint if the FQDN is not registered in a private DNS
#  ip:
# Create a Bastion in the bastion subnet when defined
bastion:
  create: false
# Create a VPN Gateway in the gateway subnet when specified
vpn_gateway:
  create: false
# Authentication configuration for accessing the az-hop portal
# Default is basic authentication. For oidc authentication you have to specify the following values
# The OIDCClient secret needs to be stored as a secret named <oidc-client-id>-password in the keyvault used by az-hop
authentication:
  httpd_auth: basic # oidc or basic
  # User mapping https://osc.github.io/ood-documentation/latest/reference/files/ood-portal-yml.html#ood-portal-generator-user-map-match
  # You can specify either a map_match or a user_map_cmd
  # Domain users are mapped to az-hop users with the same name and without the domain name
  # user_map_match: '^([^@]+)@mydomain.foo$'
  # If using a custom mapping script, update it from the ./playbooks/files directory before running the playbook
  # user_map_cmd: /opt/ood/ood_auth_map/bin/custom_mapping.sh
  # ood_auth_openidc:
  #   OIDCProviderMetadataURL: # for AAD use 'https://sts.windows.net/{{tenant_id}}/.well-known/openid-configuration'
  #   OIDCClientID: 'XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX'
  #   OIDCRemoteUserClaim: # for AAD use 'upn'
  #   OIDCScope: # for AAD use 'openid profile email groups'
  #   OIDCPassIDTokenAs: # for AAD use 'serialized'
  #   OIDCPassRefreshToken: # for AAD use 'On'
  #   OIDCPassClaimsAs: # for AAD use 'environment'
image_gallery:
  create: false # Create the shared image gallery to store custom images
# List of images to be defined
images:
  # - name: image_definition_name # Should match the packer configuration file name, one per packer file
  #   publisher: azhop
  #   offer: CentOS
  #   sku: 7_9-gen2
  #   hyper_v: V2 # V1 or V2 (V1 is the default)
  #   os_type: Linux # Linux or Windows
  #   version: 7.9 # Version of the image to create the image definition in SIG. Pattern is major.minor where minor is mandatory
  # Pre-defined images
  - name: azhop-compute-almalinux-8_7
    publisher: azhop
    offer: almalinux
    sku: 8_7-hpc-gen2
    hyper_v: V2
    os_type: Linux
    version: 8.7
  - name: azhop-centos79-v2-rdma-gpgpu
    publisher: azhop
    offer: CentOS
    sku: 7.9-gen2
    hyper_v: V2
    os_type: Linux
    version: 7.9
  # Image definition when using a custom image to build compute nodes images
  - name: azhop-centos79-v2-rdma-ci
    publisher: azhop
    offer: CentOS
    sku: 7.9-gen2-ci
    hyper_v: V2
    os_type: Linux
    version: 7.9
  # Image definition when using a custom image to build remote viz nodes images
  - name: azhop-centos79-desktop3d-ci
    publisher: azhop
    offer: CentOS
    sku: 7.9-gen2-desktop3d-ci
    hyper_v: V2
    os_type: Linux
    version: 7.9
  - name: azhop-centos79-desktop3d
    publisher: azhop
    offer: CentOS
    sku: 7.9-gen2-desktop3d
    hyper_v: V2
    os_type: Linux
    version: 7.9
  - name: azhop-compute-centos-7_9
    publisher: azhpc
    offer: azhop-compute
    sku: centos-7_9
    hyper_v: V2
    os_type: Linux
    version: 7.9
  - name: azhop-desktop-centos-7_9
    publisher: azhpc
    offer: azhop-desktop
    sku: centos-7_9
    hyper_v: V2
    os_type: Linux
    version: 7.9
  - name: azhop-compute-ubuntu-1804
    publisher: azhpc
    offer: azhop-compute
    sku: ubuntu-1804
    hyper_v: V2
    os_type: Linux
    version: 18.04
  - name: azhop-win10
    publisher: azhop
    offer: Windows-10
    sku: 21h1-pron
    hyper_v: V1
    os_type: Windows
    version: 10.19043
  # Base image when building your own HPC image and not using the HPC marketplace images
  - name: base-centos79-v2-rdma
    publisher: azhop
    offer: CentOS
    sku: 7.9-gen2-rdma-nogpu
    hyper_v: V2
    os_type: Linux
    version: 7.9
# Autoscale default settings for all queues; can be overridden on each queue depending on the VM type if needed
autoscale:
  idle_timeout: 1800 # Idle time in seconds before shutting down VMs - default to 1800 like in CycleCloud
# List of queues (node arrays in Cycle) to be defined
# Don't use queue names longer than 8 characters, in order to leave space for the node suffix: hostnames are limited to 15 chars due to domain join and NETBIOS constraints.
queues:
  - name: execute # name of the Cycle Cloud node array
    # Azure VM Instance type
    vm_size: Standard_F2s_v2
    # maximum number of cores that can be instantiated
    max_core_count: 1024
    # Use the pre-built azhop image from the marketplace
    image: azhpc:azhop-compute:centos-7_9:latest
    # Use this image ID when building your own custom images
    #image: /subscriptions/{{subscription_id}}/resourceGroups/{{resource_group}}/providers/Microsoft.Compute/galleries/{{sig_name}}/images/azhop-centos79-v2-rdma-gpgpu/latest
    # Image plan specification (when needed for the image). Terms must be accepted prior to deployment
    # plan: publisher:product:name
    # Set to true if AccelNet needs to be enabled. false is the default value
    EnableAcceleratedNetworking: false
    # spot instance support. Default is false
    spot: false
    # Set to false to disable creation of placement groups (for SLURM only). Default is true
    ColocateNodes: false
    # Specific idle time in seconds before shutting down VMs, make sure it's lower than autoscale.idle_timeout
    idle_timeout: 300
    # Set the max number of VMs in a VMSS; requires additional limit raise through support ticket for >100;
    # 100 is default value; lower numbers will improve scaling for single node jobs or jobs with small number of nodes
    MaxScaleSetSize: 100
  - name: hc44rs
    vm_size: Standard_HC44rs
    max_core_count: 440
    image: azhpc:azhop-compute:centos-7_9:latest
    spot: true
    EnableAcceleratedNetworking: true
  - name: hb120v2
    vm_size: Standard_HB120rs_v2
    max_core_count: 1200
    image: azhpc:azhop-compute:centos-7_9:latest
    spot: true
    EnableAcceleratedNetworking: true
  - name: hb120v3
    vm_size: Standard_HB120rs_v3
    max_core_count: 1200
    image: azhpc:azhop-compute:centos-7_9:latest
    spot: true
    EnableAcceleratedNetworking: true
  # Queue dedicated to GPU remote viz nodes. This name is fixed and can't be changed
  - name: viz3d
    vm_size: Standard_NV12s_v3
    max_core_count: 48
    # Use the pre-built azhop image from the marketplace
    image: azhpc:azhop-desktop:centos-7_9:latest
    # Use this image ID when building your own custom images
    #image: /subscriptions/{{subscription_id}}/resourceGroups/{{resource_group}}/providers/Microsoft.Compute/galleries/{{sig_name}}/images/azhop-centos79-desktop3d/latest
    ColocateNodes: false
    spot: false
    EnableAcceleratedNetworking: true
    max_hours: 12 # Maximum session duration
    min_hours: 1 # Minimum session duration - 0 is infinite
  # Queue dedicated to share GPU remote viz nodes. This name is fixed and can't be changed
  - name: largeviz3d
    vm_size: Standard_NV48s_v3
    max_core_count: 96
    image: azhpc:azhop-desktop:centos-7_9:latest
    ColocateNodes: false
    EnableAcceleratedNetworking: true
    spot: false
    max_hours: 12
    min_hours: 1
  # Queue dedicated to non GPU remote viz nodes. This name is fixed and can't be changed
  - name: viz
    vm_size: Standard_D8s_v5
    max_core_count: 200
    image: azhpc:azhop-desktop:centos-7_9:latest
    ColocateNodes: false
    spot: false
    EnableAcceleratedNetworking: true
    max_hours: 12
    min_hours: 1
# Remote Visualization definitions
enable_remote_winviz: false # Set to true to enable windows remote visualization
remoteviz:
  - name: winviz # This name is fixed and can't be changed
    vm_size: Standard_NV12s_v3 # Standard_NV8as_v4 Only NVsv3 and NVsV4 are supported
    max_core_count: 48
    image: "MicrosoftWindowsDesktop:Windows-10:21h1-pron:latest"
    ColocateNodes: false
    spot: false
    EnableAcceleratedNetworking: true
# Application settings
applications:
  bc_codeserver:
    enabled: true
  bc_jupyter:
    enabled: true
  bc_amlsdk:
    enabled: true
  bc_rstudio:
    enabled: true
  bc_ansys_workbench:
    enabled: true
  bc_vmd:
    enabled: true
  bc_paraview:
    enabled: true
  bc_vizer:
    enabled: true
Version
v1.0.35
In what area(s)?
/area ansible
Expected Behavior
Users can log in to Grafana from the OOD dashboard
Actual Behavior
Login fails
Steps to Reproduce the Problem
Access Grafana after deployment.
Config.yml file used: