#!/usr/bin/bash

# Map DGX1-100a HCAs to cores, so that each process has affinity to the HCA it is assigned.
# Assumes the number of ranks run per node corresponds to the number of HCAs.

# cat  /sys/class/infiniband/mlx5_*/device/local_cpulist

# GPU     HCA     NUMA    CPU
#
# GPU2    mlx5_2  1   16-31,144-159
# GPU3    mlx5_3  1   16-31,144-159
# GPU0    mlx5_0  3   48-63,176-191
# GPU1    mlx5_1  3   48-63,176-191
#
# GPU6    mlx5_8  5   80-95,208-223
# GPU7    mlx5_9  5   80-95,208-223
# GPU4    mlx5_6  7   112-127,240-255
# GPU5    mlx5_7  7   112-127,240-255

#         mlx5_4
#         mlx5_5
# 
# NUMA node0 CPU(s):   0-15  ,128-143
# NUMA node2 CPU(s):   32-47 ,160-175
# NUMA node4 CPU(s):   64-79 ,192-207
# NUMA node6 CPU(s):   96-111,224-239

# NUMA node1 CPU(s):   16-31,  144-159
# NUMA node3 CPU(s):   48-63,  176-191
# NUMA node5 CPU(s):   80-95,  208-223
# NUMA node7 CPU(s):   112-127,240-255


#######################################
# Select the CPU list and HCA for a node-local rank.
# Each rank N is paired with device mlx5_N (port 1) and with two CPU ids,
# one from each of the two CPU ranges of the NUMA node listed for that HCA
# in the table at the top of this file (presumably SMT siblings -- confirm).
# Globals:   core (written), UCX_NET_DEVICES (written and exported)
# Arguments: $1 - node-local rank, 0-9
# Returns:   0 if the rank has a mapping; 1 otherwise (globals left untouched)
#######################################
select_hca_binding() {
  case "$1" in
    0) core=48,176;    export UCX_NET_DEVICES=mlx5_0:1 ;;
    1) core=49,177;    export UCX_NET_DEVICES=mlx5_1:1 ;;
    2) core=16,144;    export UCX_NET_DEVICES=mlx5_2:1 ;;
    3) core=17,145;    export UCX_NET_DEVICES=mlx5_3:1 ;;

    # These are not necessarily correct: mlx5_4 and mlx5_5 have no GPU/NUMA
    # entry in the table at the top of this file; the cores used here belong
    # to NUMA node 0.
    # ---------------------------------------------------
    4) core=0,128;     export UCX_NET_DEVICES=mlx5_4:1 ;;
    5) core=1,129;     export UCX_NET_DEVICES=mlx5_5:1 ;;
    # ---------------------------------------------------

    6) core=112,240;   export UCX_NET_DEVICES=mlx5_6:1 ;;
    7) core=113,241;   export UCX_NET_DEVICES=mlx5_7:1 ;;
    8) core=80,208;    export UCX_NET_DEVICES=mlx5_8:1 ;;
    9) core=81,209;    export UCX_NET_DEVICES=mlx5_9:1 ;;

    # Unset, empty, or out-of-range rank: no mapping available.
    *) return 1 ;;
  esac
}

if select_hca_binding "${OMPI_COMM_WORLD_LOCAL_RANK-}"; then
  # "$@" (not $*) keeps every argument of the wrapped command intact, even
  # when it contains whitespace or glob characters.
  taskset -c "$core" "$@"
else
  # Without a mapping, `taskset -c` would consume the first word of the
  # wrapped command as its CPU list. Warn and run the command unpinned instead.
  printf '%s: no HCA/core mapping for local rank "%s"; running unpinned\n' \
    "${0##*/}" "${OMPI_COMM_WORLD_LOCAL_RANK-}" >&2
  "$@"
fi

