summary refs log tree commit diff stats
path: root/utils/subfiling_vfd
diff options
context:
space:
mode:
author Scot Breitenfeld <brtnfld@hdfgroup.org> 2023-04-26 22:56:17 (GMT)
committer GitHub <noreply@github.com> 2023-04-26 22:56:17 (GMT)
commit 7973707c86acbf67116261952fe36cdf658435bd (patch)
tree daeb17b8564be1b37743d58c285640e16e6e7eaa /utils/subfiling_vfd
parent 81bb90e77b5a16548583015c6ffca6a0b89284bc (diff)
download hdf5-7973707c86acbf67116261952fe36cdf658435bd.zip
hdf5-7973707c86acbf67116261952fe36cdf658435bd.tar.gz
hdf5-7973707c86acbf67116261952fe36cdf658435bd.tar.bz2
H5fuse.sh optimization updates (#2806)
Changed to processing subfiles at the subfile level. Simplified parameter arguments. Enabled running it in parallel. Added option to specify subfiling configuration location.
Diffstat (limited to 'utils/subfiling_vfd')
-rwxr-xr-x utils/subfiling_vfd/h5fuse.sh.in 222
1 files changed, 165 insertions, 57 deletions
diff --git a/utils/subfiling_vfd/h5fuse.sh.in b/utils/subfiling_vfd/h5fuse.sh.in
index 48e3e61..09a3a05 100755
--- a/utils/subfiling_vfd/h5fuse.sh.in
+++ b/utils/subfiling_vfd/h5fuse.sh.in
@@ -14,6 +14,7 @@ BLD='\033[1m'
GRN='\033[0;32m'
RED='\033[0;31m'
PUR='\033[0;35m'
+CYN='\033[0;36m'
NC='\033[0m' # No Color
############################################################
@@ -23,15 +24,40 @@ function usage() {
echo ""
# Display usage
echo "Purpose: Combine subfiles into a single HDF5 file. Requires the subfiling
- configuration file either as a command-line argument, or the script will
+ configuration file either as a command-line argument or the script will
search for the *.config file in the current directory."
echo ""
- echo "usage: h5fuse.sh [-h] [-f filename]"
- echo "-h Print this help."
- echo "-f filename Subfile configuration file."
+ echo "usage: h5fuse.sh [-f filename] [-h] [-p] [-q] [-r] [-v] "
+ echo "-f filename Subfile configuration file."
+ echo "-h Print this help."
+ echo "-q Quiet all output. [no]"
+ echo "-p h5fuse.sh is being run in parallel, with more than one rank. [no]"
+ echo "-r Remove subfiles after being processed. [no]"
+ echo "-v Verbose output. [no]"
echo ""
}
+function gen_mpi() {
+
+# Program to determine MPI rank and size if being run in parallel (-p).
+
+cat > "${c_src}" << EOL
+#include <mpi.h>
+#include <stdio.h>
+int main() {
+ MPI_Init(NULL, NULL);
+ int world_size;
+ MPI_Comm_size(MPI_COMM_WORLD, &world_size);
+ int world_rank;
+ MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
+ printf("%d %d", world_rank, world_size);
+ MPI_Barrier(MPI_COMM_WORLD);
+ MPI_Finalize();
+}
+EOL
+
+}
+
############################################################
############################################################
# Main program #
@@ -43,14 +69,25 @@ function usage() {
############################################################
# Get the options
file_config=""
-
-while getopts ":h:f:" option; do
+verbose="false"
+quiet="false"
+rm_subf="false"
+parallel="false"
+while getopts "hpqrvf:" option; do
case $option in
+ f) # subfiling configuration file
+ file_config=$OPTARG;;
h) # display Help
usage
exit;;
- f) # subfiling configuration file
- file_config=$OPTARG;;
+ p) # HDF5 fused file
+ parallel="true";;
+ q) # quiet all output
+ quiet="true";;
+ r) # remove completed subfiles
+ rm_subf="true";;
+ v) # verbose output
+ verbose="true";;
\?) # Invalid option
echo -e "$RED ERROR: Invalid option ${BLD}-${OPTARG}${RED} $NC"
usage
@@ -61,12 +98,20 @@ while getopts ":h:f:" option; do
done
FAILED=1
-nfiles=1
############################################################
# Configure file checks #
############################################################
+#
+SUBF_CONFDIR="${H5FD_SUBFILING_CONFIG_FILE_PREFIX:-.}"
+
+# cd to the subfile configuration location
+if [ "$SUBF_CONFDIR" != "." ] || [ "$SUBF_CONFDIR" != "$PWD" ]; then
+ cd "$SUBF_CONFDIR" || exit
+fi
+
+# Try to find the config file
if [ -z "$file_config" ]; then
- nfiles=$(find . -maxdepth 1 -type f -iname "*.config" -printf '.' | wc -m)
+ nfiles=$(find "$SUBF_CONFDIR" -maxdepth 1 -type f -iname "*.config" -printf '.' | wc -m)
if [[ "$nfiles" != "1" ]]; then
if [[ "$nfiles" == "0" ]]; then
echo -e "$RED Failed to find .config file in current directory. $NC"
@@ -78,7 +123,7 @@ if [ -z "$file_config" ]; then
exit $FAILED
fi
fi
- file_config=$(find . -maxdepth 1 -type f -iname "*.config")
+ file_config=$(find "$SUBF_CONFDIR" -maxdepth 1 -type f -iname '*.config')
fi
if [ ! -f "$file_config" ]; then
@@ -104,62 +149,125 @@ if test -z "$subfile_dir"; then
exit $FAILED
fi
-subfiles=( $( sed -e '1,/subfile_dir=/d' "$file_config" ) )
-#for i in "${subfiles[@]}"; do
-# echo "$i"
-#done
+# For bash 4.4+
+mapfile -t subfiles < <( sed -e '1,/subfile_dir=/d' "$file_config" )
if [ ${#subfiles[@]} -eq 0 ]; then
echo -e "$RED failed to find subfiles list in $file_config $NC"
exit $FAILED
fi
+nsubfiles=${#subfiles[@]}
+
+# Get the number of local subfiles
+subfiles_loc=()
+subfiles_size=()
+for i in "${subfiles[@]}"; do
+ subfile="${subfile_dir}/${i}"
+ if [ -f "${subfile}" ]; then
+ subfiles_loc+=("$subfile")
+ subfiles_size+=($(wc -c "${subfile}" | awk '{print $1}'))
+ else
+ subfiles_size+=(0)
+ fi
+done
+
+START="$(date +%s%N)"
+
+mpi_rank=0
+mpi_size=1
+nstart=1
+nend=$nsubfiles
+
+if [ "$parallel" == "true" ]; then
+
+ hex=$(hexdump -n 16 -v -e '/1 "%02X"' /dev/urandom)
+ c_exec="h5fuse_"${hex}
+ c_src=${c_exec}.c
+
+ # Generate and compile an MPI program to get MPI rank and size
+ if [ ! -f "${c_src}" ]; then
+ gen_mpi
+ CC=@CC@
+ ${CC} "${c_src}" -o "${c_exec}"
+ fi
+ wait
+ rank_size=$(./"${c_exec}")
+ read -r mpi_rank mpi_size <<<"$rank_size"
-rm -f "$hdf5_file"
+ rm -f "${c_src}" "${c_exec}"
-## COMBINE SUBFILES INTO AN HDF5 FILE ##
+ # Divide the subfiles among the ranks
+ iwork1=$(( nsubfiles / mpi_size ))
+ iwork2=$(( nsubfiles % mpi_size ))
+ min=$(( mpi_rank < iwork2 ? mpi_rank : iwork2 ))
+ nstart=$(( mpi_rank * iwork1 + 1 + min ))
+ nend=$(( nstart + iwork1 - 1 ))
+ if [ $iwork2 -gt "$mpi_rank" ]; then
+ nend=$(( nend + 1 ))
+ fi
+fi
+############################################################
+# COMBINE SUBFILES INTO AN HDF5 FILE #
+############################################################
+icnt=1
skip=0
-status=$nfiles
-START="$(date +%s%N)"
-while [ "$status" -gt 0 ]; do
- icnt=0
- for i in "${subfiles[@]}"; do
- subfile="${subfile_dir}/${i}"
- # Verify the file exists
- if [ ! -f "${subfile}" ]; then
- echo -e "$RED ERROR: file \"${subfile}\" does not exist. $NC"
- exit $FAILED
- fi
+seek=0
+seek_cnt=0
+for i in "${subfiles[@]}"; do
- # Verify the file is not being accessed by a process
- t_max=60
- t_sleep=1
- t_elapsed=0
+ subfile="${subfile_dir}/${i}"
- while fuser -s "${subfile}"; do
- if [[ $((t_elapsed % 5)) -eq 0 ]]; then
- echo -e "$GRN waiting for process to finish accessing file \"${subfile}\" ... [${t_elapsed}s/${t_max}s] $NC"
- fi
- sleep $t_sleep
- t_elapsed=$((t_elapsed+t_sleep))
- if [[ $t_elapsed -ge $t_max ]]; then
- echo -e "$RED ERROR: file \"${subfile}\" still has process accessing it after ${t_elapsed}s $NC"
- exit $FAILED
+ # bs=BYTES read and write up to BYTES bytes at a time; overrides ibs and obs
+ # ibs=BYTES read up to BYTES bytes at a time
+ # obs=BYTES write BYTES bytes at a time
+ # seek=N skip N obs-sized blocks at start of output
+ # skip=N skip N ibs-sized blocks at start of input
+
+ status=1
+ fsize=${subfiles_size[icnt-1]}
+ if [ "$fsize" -eq "0" ]; then
+ seek_cnt=$((seek_cnt+1))
+ seek=$seek_cnt
+ if [ "$rm_subf" == "true" ]; then
+ if [ -f "${subfile}" ]; then
+ \rm -f "$subfile"
+ fi
+ fi
+ else
+ if [ $icnt -ge "$nstart" ] && [ $icnt -le "$nend" ]; then
+ records_left=$fsize
+ while [ "$status" -gt 0 ]; do
+ if [ $((skip*stripe_size)) -le "$fsize" ] && [ "$records_left" -gt 0 ]; then
+ EXEC="dd count=1 bs=$stripe_size if=$subfile of=$hdf5_file skip=$skip seek=$seek conv=notrunc"
+ if [ "$verbose" == "true" ]; then
+ echo -e "$GRN $EXEC $NC"
+ fi
+ err=$( $EXEC 2>&1 1>/dev/null )
+ if [ $? -ne 0 ]; then
+ echo -e "$CYN ERR: dd Utility Failed $NC"
+ echo -e "$CYN MSG: $err $NC"
+ exit $FAILED
+ fi
+ records_left=$((records_left-stripe_size))
+ skip=$((skip+1))
+ seek=$((seek_cnt+skip*nsubfiles))
+ else
+ status=0
+ skip=0
+ fi
+ done; wait
+ if [ "$rm_subf" == "true" ]; then
+ \rm -f "$subfile"
fi
- done
-
- fsize=$(wc -c "${subfile}" | awk '{print $1}')
- if [ $((skip*stripe_size)) -le "$fsize" ]; then
- EXEC="dd count=1 bs=$stripe_size if=$subfile of=$hdf5_file skip=$skip oflag=append conv=notrunc"
- echo -e "$GRN $EXEC $NC"
- err="$( $EXEC 2>&1 > /dev/null &)"
- icnt=$((icnt+1))
- else
- subfiles=("${subfiles[@]:0:icnt}" "${subfiles[@]:$((icnt+1))}")
- status=${#subfiles[@]}
- fi
- done; wait
- skip=$((skip+1))
-done
+ fi
+ seek_cnt=$((seek_cnt+1))
+ seek=$seek_cnt
+ fi
+ icnt=$(( icnt +1 ))
+done; wait
+
END=$(( $(date +%s%N) - START ))
DURATION_SEC=$(awk -vp="$END" -vq=0.000000001 'BEGIN{printf "%.4f" ,p * q}')
-echo -e "$PUR COMPLETION TIME = $DURATION_SEC s $NC"
+if [ "$quiet" == "false" ]; then
+ echo -e "$PUR COMPLETION TIME = $DURATION_SEC s $NC"
+fi \ No newline at end of file