From 7973707c86acbf67116261952fe36cdf658435bd Mon Sep 17 00:00:00 2001 From: Scot Breitenfeld Date: Wed, 26 Apr 2023 17:56:17 -0500 Subject: H5fuse.sh optimization updates (#2806) Changed to processing subfiles at the subfile level. Simplified parameter arguments. Enabled running it in parallel. Added option to specify subfiling configuration location. --- utils/subfiling_vfd/h5fuse.sh.in | 222 +++++++++++++++++++++++++++++---------- 1 file changed, 165 insertions(+), 57 deletions(-) diff --git a/utils/subfiling_vfd/h5fuse.sh.in b/utils/subfiling_vfd/h5fuse.sh.in index 48e3e61..09a3a05 100755 --- a/utils/subfiling_vfd/h5fuse.sh.in +++ b/utils/subfiling_vfd/h5fuse.sh.in @@ -14,6 +14,7 @@ BLD='\033[1m' GRN='\033[0;32m' RED='\033[0;31m' PUR='\033[0;35m' +CYN='\033[0;36m' NC='\033[0m' # No Color ############################################################ @@ -23,15 +24,40 @@ function usage() { echo "" # Display usage echo "Purpose: Combine subfiles into a single HDF5 file. Requires the subfiling - configuration file either as a command-line argument, or the script will + configuration file either as a command-line argument or the script will search for the *.config file in the current directory." echo "" - echo "usage: h5fuse.sh [-h] [-f filename]" - echo "-h Print this help." - echo "-f filename Subfile configuration file." + echo "usage: h5fuse.sh [-f filename] [-h] [-p] [-q] [-r] [-v] " + echo "-f filename Subfile configuration file." + echo "-h Print this help." + echo "-q Quiet all output. [no]" + echo "-p h5fuse.sh is being run in parallel, with more than one rank. [no]" + echo "-r Remove subfiles after being processed. [no]" + echo "-v Verbose output. [no]" echo "" } +function gen_mpi() { + +# Program to determine MPI rank and size if being run in parallel (-p). 
+ +cat > "${c_src}" << EOL +#include <mpi.h> +#include <stdio.h> +int main() { + MPI_Init(NULL, NULL); + int world_size; + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + int world_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + printf("%d %d", world_rank, world_size); + MPI_Barrier(MPI_COMM_WORLD); + MPI_Finalize(); +} +EOL + +} + ############################################################ ############################################################ # Main program # @@ -43,14 +69,25 @@ function usage() { ############################################################ # Get the options file_config="" - -while getopts ":h:f:" option; do +verbose="false" +quiet="false" +rm_subf="false" +parallel="false" +while getopts "hpqrvf:" option; do case $option in + f) # subfiling configuration file + file_config=$OPTARG;; h) # display Help usage exit;; - f) # subfiling configuration file - file_config=$OPTARG;; + p) # HDF5 fused file + parallel="true";; + q) # quiet all output + quiet="true";; + r) # remove completed subfiles + rm_subf="true";; + v) # verbose output + verbose="true";; \?) # Invalid option echo -e "$RED ERROR: Invalid option ${BLD}-${OPTARG}${RED} $NC" usage @@ -61,12 +98,20 @@ while getopts ":h:f:" option; do done FAILED=1 -nfiles=1 ############################################################ # Configure file checks # ############################################################ +# +SUBF_CONFDIR="${H5FD_SUBFILING_CONFIG_FILE_PREFIX:-.}" + +# cd to the subfile configuration location +if [ "$SUBF_CONFDIR" != "." ] || [ "$SUBF_CONFDIR" != "$PWD" ]; then + cd "$SUBF_CONFDIR" || exit +fi + +# Try to find the config file if [ -z "$file_config" ]; then - nfiles=$(find . -maxdepth 1 -type f -iname "*.config" -printf '.' | wc -m) + nfiles=$(find "$SUBF_CONFDIR" -maxdepth 1 -type f -iname "*.config" -printf '.' | wc -m) if [[ "$nfiles" != "1" ]]; then if [[ "$nfiles" == "0" ]]; then echo -e "$RED Failed to find .config file in current directory. 
$NC" @@ -78,7 +123,7 @@ if [ -z "$file_config" ]; then exit $FAILED fi fi - file_config=$(find . -maxdepth 1 -type f -iname "*.config") + file_config=$(find "$SUBF_CONFDIR" -maxdepth 1 -type f -iname '*.config') fi if [ ! -f "$file_config" ]; then @@ -104,62 +149,125 @@ if test -z "$subfile_dir"; then exit $FAILED fi -subfiles=( $( sed -e '1,/subfile_dir=/d' "$file_config" ) ) -#for i in "${subfiles[@]}"; do -# echo "$i" -#done +# For bash 4.4+ +mapfile -t subfiles < <( sed -e '1,/subfile_dir=/d' "$file_config" ) if [ ${#subfiles[@]} -eq 0 ]; then echo -e "$RED failed to find subfiles list in $file_config $NC" exit $FAILED fi +nsubfiles=${#subfiles[@]} + +# Get the number of local subfiles +subfiles_loc=() +subfiles_size=() +for i in "${subfiles[@]}"; do + subfile="${subfile_dir}/${i}" + if [ -f "${subfile}" ]; then + subfiles_loc+=("$subfile") + subfiles_size+=($(wc -c "${subfile}" | awk '{print $1}')) + else + subfiles_size+=(0) + fi +done + +START="$(date +%s%N)" + +mpi_rank=0 +mpi_size=1 +nstart=1 +nend=$nsubfiles + +if [ "$parallel" == "true" ]; then + + hex=$(hexdump -n 16 -v -e '/1 "%02X"' /dev/urandom) + c_exec="h5fuse_"${hex} + c_src=${c_exec}.c + + # Generate and compile an MPI program to get MPI rank and size + if [ ! -f "${c_src}" ]; then + gen_mpi + CC=@CC@ + ${CC} "${c_src}" -o "${c_exec}" + fi + wait + rank_size=$(./"${c_exec}") + read -r mpi_rank mpi_size <<<"$rank_size" -rm -f "$hdf5_file" + rm -f "${c_src}" "${c_exec}" -## COMBINE SUBFILES INTO AN HDF5 FILE ## + # Divide the subfiles among the ranks + iwork1=$(( nsubfiles / mpi_size )) + iwork2=$(( nsubfiles % mpi_size )) + min=$(( mpi_rank < iwork2 ? 
mpi_rank : iwork2 )) + nstart=$(( mpi_rank * iwork1 + 1 + min )) + nend=$(( nstart + iwork1 - 1 )) + if [ $iwork2 -gt "$mpi_rank" ]; then + nend=$(( nend + 1 )) + fi +fi +############################################################ +# COMBINE SUBFILES INTO AN HDF5 FILE # +############################################################ +icnt=1 skip=0 -status=$nfiles -START="$(date +%s%N)" -while [ "$status" -gt 0 ]; do - icnt=0 - for i in "${subfiles[@]}"; do - subfile="${subfile_dir}/${i}" - # Verify the file exists - if [ ! -f "${subfile}" ]; then - echo -e "$RED ERROR: file \"${subfile}\" does not exist. $NC" - exit $FAILED - fi +seek=0 +seek_cnt=0 +for i in "${subfiles[@]}"; do - # Verify the file is not being accessed by a process - t_max=60 - t_sleep=1 - t_elapsed=0 + subfile="${subfile_dir}/${i}" - while fuser -s "${subfile}"; do - if [[ $((t_elapsed % 5)) -eq 0 ]]; then - echo -e "$GRN waiting for process to finish accessing file \"${subfile}\" ... [${t_elapsed}s/${t_max}s] $NC" - fi - sleep $t_sleep - t_elapsed=$((t_elapsed+t_sleep)) - if [[ $t_elapsed -ge $t_max ]]; then - echo -e "$RED ERROR: file \"${subfile}\" still has process accessing it after ${t_elapsed}s $NC" - exit $FAILED + # bs=BYTES read and write up to BYTES bytes at a time; overrides ibs and obs + # ibs=BYTES read up to BYTES bytes at a time + # obs=BYTES write BYTES bytes at a time + # seek=N skip N obs-sized blocks at start of output + # skip=N skip N ibs-sized blocks at start of input + + status=1 + fsize=${subfiles_size[icnt-1]} + if [ "$fsize" -eq "0" ]; then + seek_cnt=$((seek_cnt+1)) + seek=$seek_cnt + if [ "$rm_subf" == "true" ]; then + if [ -f "${subfile}" ]; then + \rm -f "$subfile" + fi + fi + else + if [ $icnt -ge "$nstart" ] && [ $icnt -le "$nend" ]; then + records_left=$fsize + while [ "$status" -gt 0 ]; do + if [ $((skip*stripe_size)) -le "$fsize" ] && [ "$records_left" -gt 0 ]; then + EXEC="dd count=1 bs=$stripe_size if=$subfile of=$hdf5_file skip=$skip seek=$seek conv=notrunc" 
+ if [ "$verbose" == "true" ]; then + echo -e "$GRN $EXEC $NC" + fi + err=$( $EXEC 2>&1 1>/dev/null ) + if [ $? -ne 0 ]; then + echo -e "$CYN ERR: dd Utility Failed $NC" + echo -e "$CYN MSG: $err $NC" + exit $FAILED + fi + records_left=$((records_left-stripe_size)) + skip=$((skip+1)) + seek=$((seek_cnt+skip*nsubfiles)) + else + status=0 + skip=0 + fi + done; wait + if [ "$rm_subf" == "true" ]; then + \rm -f "$subfile" fi - done - - fsize=$(wc -c "${subfile}" | awk '{print $1}') - if [ $((skip*stripe_size)) -le "$fsize" ]; then - EXEC="dd count=1 bs=$stripe_size if=$subfile of=$hdf5_file skip=$skip oflag=append conv=notrunc" - echo -e "$GRN $EXEC $NC" - err="$( $EXEC 2>&1 > /dev/null &)" - icnt=$((icnt+1)) - else - subfiles=("${subfiles[@]:0:icnt}" "${subfiles[@]:$((icnt+1))}") - status=${#subfiles[@]} - fi - done; wait - skip=$((skip+1)) -done + fi + seek_cnt=$((seek_cnt+1)) + seek=$seek_cnt + fi + icnt=$(( icnt +1 )) +done; wait + END=$(( $(date +%s%N) - START )) DURATION_SEC=$(awk -vp="$END" -vq=0.000000001 'BEGIN{printf "%.4f" ,p * q}') -echo -e "$PUR COMPLETION TIME = $DURATION_SEC s $NC" +if [ "$quiet" == "false" ]; then + echo -e "$PUR COMPLETION TIME = $DURATION_SEC s $NC" +fi \ No newline at end of file -- cgit v0.12