#!/usr/bin/bash -eE
# Enable errexit/errtrace explicitly as well: flags given on the shebang line
# are ignored when the script is started as "bash <script>" instead of being
# executed directly, and the rest of the script relies on errexit.
set -eE

echo

# Print the command-line help text to stdout.
# The heredoc delimiter is unquoted, so $0 is expanded to the script name.
usage () {
cat <<HELP_USAGE

$0 [options] <parameters>
		
	Parameters:
        -v|--verbose                  Set verbose mode
        -f|--hostfile <hostfile>      File with newline separated hostnames to run tests on.
        -r|--hpcx_dir <path>          Path to HPCX installation root folder (or use env HPCX_DIR)

	Options:
        -p|--ppn <number>             Select number of processes per hostname (default: 1)
        -d|--hca_list "string"        Comma separated list of HCAs to use (default: autoselect)
        -t|--transport_list "string"  List of RDMA transports to use (rc,dc,ud) (default: autoselect best)
        -cx7|--connectx-7             Configure to use 4 QPs for ConnectX-7 to maximize performance
        -z|--traffic <nn>             Run traffic for 'nn' minutes. do NOT use during performance testing!
        -s|--ssh                      Use ssh for process launching (default: autoselect)
        -h|--help                     Show help message
        -n|--dry-run                  Dry run (do nothing, only print)
        -m|--map-by                   [node|core|socket]  (Used in MPI argument: --map-by ppr:ppn:map-by) (default: node)
        -y|--bycore                   Run on ALL cores, not just a single core per node
        -k|--test_intra_node          Run intra-node tests for bandwidth and latency (default: skip intra-node)
                                      Is ignored if --output is given.
        -U|--unidirectional           Run unidirectional bandwidth tests (default: bidirectional)
        -e|--mapper                   shell script that maps local MPI rank to a core and one or more HCAs
                                      e.g. for testing machines with multiple HCAs, where each HCA needs to be tested
        -g|--gpu                      Run GPU lat/bw/neighbor tests
        -G|--gpudirect                Run GPU tests with GPU-Direct.
        -w|--rdma_write               Use RDMA-write to pass data to the remote host.
        -o|--rdma_read                Use RDMA-read to access data from the remote host.
        -P|--performance              Set CPU scaling governor to 'performance'. Set back to 'powersave' after execution
        -a|--output                   Generate zip of heatmaps and tgz of JSON files from output. Overrides -k.
        output options:
            -l|--normalize        Normalize latency results                       default: false
            -C|--clean            Erase output cache directory                    default: false
            -R|--report           create report (PDF)                             default: false
            --per_node            Force per-node output handling                  default: autodetect

        -x|--exe_opt                  Options for clusterkit.
        -i|--mpi_opt                  Options for mpirun.


    To pass additional MPI options, use -i|--mpi_opt or the mpi_opt environment variable.
    To pass additional options to the clusterkit executable, use -x|--exe_opt or the exe_opt environment variable.

        Examples:
               % $0 --ssh --hostfile hostfile.txt

               % $0 --hca_list "mlx5_0:1,mlx5_2:1" --hostfile hostfile.txt

               % exe_opt="--gpudirect     "             $0 --hca_list "mlx5_0:1,mlx5_2:1" --hostfile hostfile.txt

               % mpi_opt="-x UCX_RNDV_SCHEME=get_zcopy" $0 --hca_list "mlx5_0:1,mlx5_2:1" --hostfile hostfile.txt



HELP_USAGE
}

# Verbosity flag: 0 = quiet (default), 1 = verbose (set by -v|--verbose).
verbose=0

# Print $1 on stdout, but only while verbose mode is enabled.
# Always succeeds when verbose mode is off.
log_verbose () {
	[ $verbose -eq 1 ] || return 0
	echo "$1"
}

# Check whether the 'ucx-cuda' package is installed, using dpkg-query when it
# is available and rpm otherwise.
# Returns: 0 if the package is reported as installed; non-zero otherwise
#          (1 when neither tool exists or dpkg lists no matching entry,
#          the exit status of 'rpm -q' on the rpm path).
is_ucx_cuda_installed () {
	local rc=1
	local dpkg_query=/usr/bin/dpkg-query
	local rpm=/usr/bin/rpm
	local avail

	if [[ -x $dpkg_query ]]; then
		# Keep the package name of every listing line whose two leading
		# status characters match [rhi][iU].
		avail=$("$dpkg_query" -l ucx-cuda 2> /dev/null | awk '/^[rhi][iU]/{print $2}')
		# Quoted test: more than one matching line must still count as installed.
		if [[ -n "$avail" ]]; then
			rc=0 # installed
		fi
	elif [[ -x $rpm ]]; then
		# if ucx_cuda is installed, rc will be 0 (true)
		rc=0
		"$rpm" -q ucx-cuda > /dev/null 2>&1 || rc=$?
	fi

	return $rc
}

# ---- Defaults for settings controlled from the command line ----
ppn=1        # processes per hostname (-p|--ppn)
nodes=       # becomes "-machinefile <file>" with -f|--hostfile
debug=       # becomes "echo" with -n|--dry-run
gpu=0 end_time=0 test_intra_node=0 performance=0 output=0
output_args=""
exe_args=""

# ---- Extra options that may be pre-seeded from the environment ----
exe_opt="${exe_opt:-}"
# mpi_opt is always kept terminated by a blank so further options can be appended.
mpi_opt="${mpi_opt:-} "
opt="${opt:-}" # Backward compatibility - DELETE on next release
mpi_opt+="${opt} " # Backward compatibility - DELETE on next release

# Choose the output directory for this run, unless CK_OUTPUT_SUBDIR is already set.
# Globals: CK_OUTPUT_SUBDIR (exported as <cwd>/<timestamp>), output_dir (written),
#          mpi_opt (gets "-x CK_OUTPUT_SUBDIR " appended).
# Calling it again is a no-op because CK_OUTPUT_SUBDIR is then non-empty.
set_output_dir () {
	if [[ -n "${CK_OUTPUT_SUBDIR}" ]]; then
		return 0
	fi
	output_dir=$(date "+%Y%m%d_%H%M%S")
	export CK_OUTPUT_SUBDIR="$(pwd)/${output_dir}"
	mpi_opt+="-x CK_OUTPUT_SUBDIR "
}

# Set the CPU frequency scaling governor on all nodes.
# Arguments: $1 - governor name to write (the script uses 'performance' and 'powersave')
# Globals:   MPI_HOME (read), nodes (read)
# Returns:   exit status of mpirun
# A small helper script is written to the current directory and launched with
# one process per node through mpirun. The helper is removed afterwards even
# when mpirun fails, so no stale file is left behind.
set_scaling_governor() {
	local arg=$1
	local tmp_script=tmp_scaling_governor.sh
	local rc=0
	echo "Setting CPU scaling governor to '${arg}' on all nodes"
	# Unquoted heredoc delimiter: $arg is expanded now, while the helper is generated.
	cat > "$tmp_script" << __EOF
#!/bin/bash
echo $arg | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor &> /dev/null
__EOF
	chmod +x "$tmp_script"
	# $nodes is intentionally unquoted: it holds "-machinefile <file>" as two words.
	"$MPI_HOME/bin/mpirun" --map-by ppr:1:node $nodes "$tmp_script" || rc=$?
	rm -f "$tmp_script"
	return $rc
}

# Abort with an error message and the usage text when an option that takes a
# value is the last word on the command line.
# Arguments: $1 - option as typed, $2 - number of remaining words (option included)
require_value () {
	if [[ $2 -lt 2 ]]; then
		echo "Error: option $1 requires an argument"
		usage
		exit 1
	fi
}

# Command-line parsing. Each iteration consumes one option; options that take
# a value shift once more inside their case arm. Most options only append to
# mpi_opt (mpirun arguments), exe_args/exe_opt (clusterkit arguments) or
# output_args (output post-processing arguments).
while [[ $# -gt 0 ]]
do
	key="${1}"
	case ${key} in
		-s|--ssh)
			mpi_opt+="--mca plm_rsh_agent ssh "
			;;
		-v|--verbose)
			verbose=1
			;;
		-n|--dry-run)
			# "echo" is later prefixed to the mpirun command line
			debug=echo
			;;
		-f|--hostfile)
			require_value "${key}" $#
			nodes="-machinefile ${2}"
			shift # past argument
			;;
		-r|--hpcx_dir)
			require_value "${key}" $#
			HPCX_DIR="${2}"
			shift # past argument
			;;
		-p|--ppn)
			require_value "${key}" $#
			ppn="${2}"
			shift # past argument
			;;
		-m|--map-by)
			require_value "${key}" $#
			mapby="${2}"
			shift # past argument
			;;
		-y|--bycore)
			exe_args+="--bycore "
			;;
		-k|--test_intra_node)
			test_intra_node=1
			;;
		-U|--unidirectional)
			# forwarded to clusterkit exactly as typed
			exe_args+="${1} "
			;;
		-g|--gpu)
			gpu=1
			exe_args+="-d gpumb "
			exe_args+="-d gpu_gpu_lat -d gpu_gpu_bw "
			exe_args+="-d gpu_host_lat -d gpu_host_bw "
			exe_args+="-d gpu_neighbor_lat -d gpu_neighbor_bw "
			;;
		-G|--gpudirect)
			# forwarded to clusterkit exactly as typed
			exe_opt+="${1} "
			;;
		-e|--mapper)
			require_value "${key}" $#
			mapper="${2}"
			exe_args+="--bycore "
			shift # past argument
			;;
		-d|--hca_list)
			require_value "${key}" $#
			mpi_opt+="-x UCX_NET_DEVICES=${2} "
			shift # past argument
			;;
		-t|--transport_list)
			require_value "${key}" $#
			mpi_opt+="-x UCX_TLS=${2} "
			shift # past argument
			;;
		-cx7|--connectx-7)
			mpi_opt+="-x UCX_MAX_RNDV_LANES=4 "
			;;
		-w|--rdma_write)
			mpi_opt+="-x UCX_RNDV_SCHEME=put_zcopy "
			;;
		-o|--rdma_read)
			mpi_opt+="-x UCX_RNDV_SCHEME=get_zcopy "
			;;
		-P|--performance)
			performance=1
			;;
		-a|--output)
			output=1
			set_output_dir
			;;
		# output options - all forwarded unchanged to the output post-processing
		-l|--normalize|--per_node|-C|--clean|-R|--report)
			output_args+="${1} "
			;;
		-x|--exe_opt)
			require_value "${key}" $#
			exe_opt+="${2} "
			shift # past argument
			;;
		-i|--mpi_opt)
			require_value "${key}" $#
			mpi_opt+="${2} "
			shift # past argument
			;;
		-z|--traffic)
			require_value "${key}" $#
			exe_args+="--run-time ${2} "
			shift # past argument
			# we are not interested in multiple output directories, send all output to the same dir
			set_output_dir

			# setup so that we are constantly checking N/2 pairs, i.e. all nodes
			exe_args+="--btol=.2 --biters=10000 -d bw --bycore "
			;;
		-h|--help)
			usage
			exit 0
			;;
		*)    # unknown option
			echo "Error: unknown option $key"
			usage
			exit 1
			;;
	esac
	shift
done

# Intra-node tests are skipped whenever --output was requested, and also
# whenever -k|--test_intra_node was not given.
if [ $output -eq 1 ] || [ $test_intra_node -eq 0 ]; then
	exe_args+="--skip_intra_node "
fi

# A mapper script and "--map-by core" are mutually exclusive.
if [ -n "$mapper" ]; then
    if [ "$mapby" == "core" ]; then
        echo "Error: Cannot map by core and also use a mapper"
        exit 1
    fi
fi

# Locate the MPI installation (MPI_HOME) and the clusterkit executable (exe).
# Modes, checked in this order:
#   1. The script lives in /usr/bin: MPI_HOME defaults to a fixed OpenMPI path.
#   2. HPCX_DIR is set: source hpcx-init.sh, call hpcx_load and take clusterkit
#      from HPCX_CLUSTERKIT_DIR.
#   3. Otherwise: keep MPI_HOME if set, else pick the first
#      /usr/mpi/gcc/openmpi-* entry that contains bin/mpirun.
exe=
SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
if [ "$SCRIPT_DIR" == "/usr/bin" ]; then
    log_verbose "MLNX_OFED Mode"
    if [ -z "$MPI_HOME" ]; then
        MPI_HOME="/usr/mpi/gcc/openmpi-4.1.7rc1"
    fi
    if [ ! -f "$MPI_HOME/bin/mpirun" ]; then
        echo "Error: openmpi installation not found in $MPI_HOME, please set MPI_HOME"
        exit 1
    fi

elif [ -n "$HPCX_DIR" ]; then
    log_verbose "HPCX Mode"
    if [ ! -f "${HPCX_DIR}/hpcx-init.sh" ]; then
        echo "Error: hpcx installation not found $HPCX_DIR"
        usage
        exit 1
    fi
    source "${HPCX_DIR}/hpcx-init.sh"
    hpcx_load

    exe=$HPCX_CLUSTERKIT_DIR/bin/clusterkit

else
    # HPCX not specified, try to use ompi MOFED installation path
    log_verbose "Searching MLNX_OFED Mode"
    if [ -z "$MPI_HOME" ]; then
        OMPI_SEARCH_DIR=/usr/mpi/gcc/
        # A glob is used rather than a command substitution around 'find': when
        # the search directory does not exist the loop simply finds nothing, so
        # the error message below is reached instead of errexit aborting first.
        for mpi_h in "$OMPI_SEARCH_DIR"openmpi-*; do
            if [ -f "$mpi_h/bin/mpirun" ]; then
                MPI_HOME=$mpi_h
                break
            fi
        done
        if [ -z "$MPI_HOME" ]; then
            echo "Error: openmpi installation not found in $OMPI_SEARCH_DIR, please set MPI_HOME or HPCX_DIR"
            exit 1
        fi
    fi
fi

# Fall back to a clusterkit binary that sits next to this script.
exe=${exe:-"$SCRIPT_DIR/clusterkit"}
# exe=./bin/clusterkit # Comment this line before COMMIT
if [ ! -f "$exe" ]; then
    echo "Error: clusterkit executable not found in $exe, please re-install MOFED or set HPCX_DIR"
    exit 1
fi

# Prepare the MPI runtime environment and the fixed part of the mpirun options.
ppr=$ppn


echo "MPI_HOME=$MPI_HOME"
# Collect the lib64 and/or lib entries directly below MPI_HOME. When both exist
# they are joined with ':' so the value stays a valid LD_LIBRARY_PATH prefix.
# Reading from a process substitution keeps a failing 'find' (e.g. MPI_HOME
# missing) from aborting the script before the error message below.
MPI_LIB=""
while IFS= read -r mpi_libdir; do
    MPI_LIB+="${MPI_LIB:+:}${mpi_libdir}"
done < <(find "$MPI_HOME" -maxdepth 1 \( -name "lib64" -o -name "lib" \))
if [ -z "$MPI_LIB" ]; then
        echo "Error: openmpi lib not found in $MPI_HOME, please set MPI_HOME"
        exit 1
fi

# The ucx-cuda package check is only done when HPCX_DIR is not in use.
if [ -z "$HPCX_DIR" ]; then
    if [ $gpu -eq 1 ] && ! is_ucx_cuda_installed ; then
        echo "[WARNING] ucx-cuda is not installed, GPU tests may not run"
    fi
fi

export LD_LIBRARY_PATH=$MPI_LIB:$LD_LIBRARY_PATH
export PATH=$MPI_HOME/bin:$PATH
export OPAL_PREFIX=$MPI_HOME

mpi_opt+="--allow-run-as-root "
mpi_opt+="--mca pml ucx --mca coll ^hcoll "
mapby=${mapby:-"node"}
mpi_opt+="--map-by ppr:$ppr:$mapby "
# forward the environment prepared above to the launched processes
mpi_opt+="-x PATH -x LD_LIBRARY_PATH -x OPAL_PREFIX "
#mpi_opt+="--mca routed direct " # Sometimes required.
#mpi_opt+="--mca plm_rsh_num_concurrent 128 " # Sometimes needed, but generally NOT. Will impact how fast a job is launched
#mpi_opt+="--mca plm_rsh_no_tree_spawn 1 " # Sometimes needed, but generally NOT. Will impact how fast a job is launched

echo "mpi_opt=$mpi_opt"
echo "exe_opt=$exe_opt"

# Run clusterkit through mpirun, optionally switching the CPU scaling governor
# around the run. In dry-run mode ($debug is "echo") every command here,
# including the governor changes, is only printed.
if [[ $performance -eq 1 ]]; then
	$debug set_scaling_governor performance
fi

if [ $verbose -eq 1 ]; then
    set -x
fi

# mpi_opt, nodes, mapper, exe_args and exe_opt are deliberately unquoted: they
# are blank-separated option lists (or empty) and must be split into words.
# The status is captured instead of letting errexit abort, so the governor is
# restored below even when the run fails.
run_rc=0
$debug "$MPI_HOME/bin/mpirun" $mpi_opt $nodes $mapper "$exe" $exe_args $exe_opt || run_rc=$?

if [[ $performance -eq 1 ]]; then
	$debug set_scaling_governor powersave
fi

# A failed run still terminates the script with mpirun's exit status.
if [[ $run_rc -ne 0 ]]; then
	exit $run_rc
fi

# Post-process the results when -a|--output was given.
if [ $output -eq 1 ];then

	echo;echo;echo
	# check if bycore suffix exists, to infer if execution was per node/hca
	# edge case: node name ends with '-000'. fix is to explicitly add --per_node in CLI
	# (a missing or unreadable latency.json is treated like "no suffix found")
	if ! grep -q "\-000\": 0," "${CK_OUTPUT_SUBDIR}/latency.json"; then
		output_args+="--per_node "
	fi
	echo Clusterkit finished, handling output files...
	source_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
	# output_args is deliberately unquoted (blank-separated option list); the
	# paths are quoted so directories containing blanks work. In dry-run mode
	# ($debug is "echo") the command is only printed.
	$debug "$source_dir/output/run.sh" $output_args -o "$CK_OUTPUT_SUBDIR"
fi
