From 985af5617fe40c9a6e9a0083dc889e56a7add362 Mon Sep 17 00:00:00 2001 From: James Laird Date: Mon, 23 May 2005 13:20:07 -0500 Subject: [svn-r10785] Purpose: Feature Description: Added "support" for UTF-8 character encoding. Solution: Wrote tests to check that UTF-8 can be used in a number of places in HDF5 (object names, data, etc.). These tests live in test/tunicode.c. Added a new UTF-8 character encoding for datatypes. Platforms tested: mir, modi4, heping Misc. update: --- MANIFEST | 1 + fortran/src/H5f90global.f90 | 16 +- release_docs/RELEASE.txt | 3 + src/H5.c | 4 +- src/H5Tconv.c | 6 +- src/H5Tpublic.h | 4 +- test/Makefile.am | 4 +- test/Makefile.in | 8 +- test/testhdf5.c | 1 + test/testhdf5.h | 2 + test/tunicode.c | 837 ++++++++++++++++++++++++++++++++++++++++++++ tools/h5ls/h5ls.c | 4 +- 12 files changed, 872 insertions(+), 18 deletions(-) create mode 100644 test/tunicode.c diff --git a/MANIFEST b/MANIFEST index 61a3f34..c344b93 100644 --- a/MANIFEST +++ b/MANIFEST @@ -1081,6 +1081,7 @@ ./test/ttsafe_cancel.c ./test/ttsafe_dcreate.c ./test/ttsafe_error.c +./test/tunicode.c ./test/tvlstr.c ./test/tvltypes.c ./test/unlink.c diff --git a/fortran/src/H5f90global.f90 b/fortran/src/H5f90global.f90 index c1b8817..b17680e 100644 --- a/fortran/src/H5f90global.f90 +++ b/fortran/src/H5f90global.f90 @@ -470,7 +470,7 @@ ! ! H5T flags declaration ! 
- INTEGER, PARAMETER :: H5T_FLAGS_LEN = 30 + INTEGER, PARAMETER :: H5T_FLAGS_LEN = 31 INTEGER H5T_flags(H5T_FLAGS_LEN) !DEC$if defined(BUILD_HDF5_DLL) !DEC$ ATTRIBUTES DLLEXPORT :: /H5T_FLAGS/ @@ -503,6 +503,7 @@ INTEGER :: H5T_NORM_MSBSET_F INTEGER :: H5T_NORM_NONE_F INTEGER :: H5T_CSET_ASCII_F + INTEGER :: H5T_CSET_UTF8_F INTEGER :: H5T_STR_NULLTERM_F INTEGER :: H5T_STR_NULLPAD_F INTEGER :: H5T_STR_SPACEPAD_F @@ -532,12 +533,13 @@ EQUIVALENCE(H5T_flags(22), H5T_NORM_MSBSET_F) EQUIVALENCE(H5T_flags(23), H5T_NORM_NONE_F) EQUIVALENCE(H5T_flags(24), H5T_CSET_ASCII_F) - EQUIVALENCE(H5T_flags(25), H5T_STR_NULLTERM_F) - EQUIVALENCE(H5T_flags(26), H5T_STR_NULLPAD_F) - EQUIVALENCE(H5T_flags(27), H5T_STR_SPACEPAD_F) - EQUIVALENCE(H5T_flags(28), H5T_STR_ERROR_F) - EQUIVALENCE(H5T_flags(29), H5T_VLEN_F) - EQUIVALENCE(H5T_flags(30), H5T_ARRAY_F) + EQUIVALENCE(H5T_flags(25), H5T_CSET_UTF8_F) + EQUIVALENCE(H5T_flags(26), H5T_STR_NULLTERM_F) + EQUIVALENCE(H5T_flags(27), H5T_STR_NULLPAD_F) + EQUIVALENCE(H5T_flags(28), H5T_STR_SPACEPAD_F) + EQUIVALENCE(H5T_flags(29), H5T_STR_ERROR_F) + EQUIVALENCE(H5T_flags(30), H5T_VLEN_F) + EQUIVALENCE(H5T_flags(31), H5T_ARRAY_F) ! ! H5Z flags declaration diff --git a/release_docs/RELEASE.txt b/release_docs/RELEASE.txt index cc83743..85dac4f 100644 --- a/release_docs/RELEASE.txt +++ b/release_docs/RELEASE.txt @@ -90,6 +90,9 @@ New Features Library: -------- + - Added H5T_CSET_UTF8 character set to mark datatypes that use the + UTF-8 Unicode character encoding. Added tests to ensure that + library handles UTF-8 object names, attributes, etc. -JL 2005/05/13 - HDF5 supports collective MPI-IO for irregular selection with HDF5 dataset. Irregular selection is when users use H5Sselect_hyperslab more than once for the same dataset. diff --git a/src/H5.c b/src/H5.c index 821ee3a..5956c9e 100644 --- a/src/H5.c +++ b/src/H5.c @@ -2687,7 +2687,9 @@ H5_trace (const double *returning, const char *func, const char *type, ...) 
case H5T_CSET_ASCII: fprintf (out, "H5T_CSET_ASCII"); break; - case H5T_CSET_RESERVED_1: + case H5T_CSET_UTF8: + fprintf (out, "H5T_CSET_UTF8"); + break; case H5T_CSET_RESERVED_2: case H5T_CSET_RESERVED_3: case H5T_CSET_RESERVED_4: diff --git a/src/H5Tconv.c b/src/H5Tconv.c index 5b4b4b7..d4125a1 100644 --- a/src/H5Tconv.c +++ b/src/H5Tconv.c @@ -3907,8 +3907,10 @@ H5T_conv_s_s (hid_t src_id, hid_t dst_id, H5T_cdata_t *cdata, size_t nelmts, HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "bad precision"); if (0 != src->shared->u.atomic.offset || 0 != dst->shared->u.atomic.offset) HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "bad offset"); - if (H5T_CSET_ASCII != src->shared->u.atomic.u.s.cset || H5T_CSET_ASCII != dst->shared->u.atomic.u.s.cset) - HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "bad character set"); + if (H5T_CSET_ASCII != src->shared->u.atomic.u.s.cset && H5T_CSET_UTF8 != src->shared->u.atomic.u.s.cset) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "bad source character set"); + if (H5T_CSET_ASCII != dst->shared->u.atomic.u.s.cset && H5T_CSET_UTF8 != dst->shared->u.atomic.u.s.cset) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "bad destination character set"); if (src->shared->u.atomic.u.s.pad<0 || src->shared->u.atomic.u.s.pad>=H5T_NPAD || dst->shared->u.atomic.u.s.pad<0 || dst->shared->u.atomic.u.s.pad>=H5T_NPAD) HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "bad character padding"); diff --git a/src/H5Tpublic.h b/src/H5Tpublic.h index 98e92d3..8fe400a 100644 --- a/src/H5Tpublic.h +++ b/src/H5Tpublic.h @@ -78,7 +78,7 @@ typedef enum H5T_norm_t { typedef enum H5T_cset_t { H5T_CSET_ERROR = -1, /*error */ H5T_CSET_ASCII = 0, /*US ASCII */ - H5T_CSET_RESERVED_1 = 1, /*reserved for later use */ + H5T_CSET_UTF8 = 1, /*UTF-8 Unicode encoding */ H5T_CSET_RESERVED_2 = 2, /*reserved for later use */ H5T_CSET_RESERVED_3 = 3, /*reserved for later use */ H5T_CSET_RESERVED_4 = 4, /*reserved for later use */ @@ -94,7 +94,7 @@ typedef enum H5T_cset_t { H5T_CSET_RESERVED_14 = 14, 
/*reserved for later use */ H5T_CSET_RESERVED_15 = 15 /*reserved for later use */ } H5T_cset_t; -#define H5T_NCSET H5T_CSET_RESERVED_1 /*Number of character sets actually defined */ +#define H5T_NCSET H5T_CSET_RESERVED_2 /*Number of character sets actually defined */ /* * Type of padding to use in character strings. Do not change these values diff --git a/test/Makefile.am b/test/Makefile.am index 4a700a4..b3f888a 100644 --- a/test/Makefile.am +++ b/test/Makefile.am @@ -81,7 +81,7 @@ MOSTLYCLEANFILES=cmpd_dset.h5 compact_dataset.h5 dataset.h5 extend.h5 istore.h5\ extern_[1-3].h5 extern_[1-4][ab].raw gheap[0-4].h5 dt_atomic[1-2]\ links.h5 links[1-3].h5 big.data big[0-9][0-9][0-9][0-9][0-9].h5 \ dtypes[1-8].h5 dt_atomic[1-2].h5 tattr.h5 tselect.h5 mtime.h5 \ - unlink.h5 \ + unlink.h5 unicode.h5 \ fillval_[0-9].h5 fillval.raw mount_[0-9].h5 testmeta.h5 ttime.h5 \ trefer[1-3].h5 tvltypes.h5 tvlstr.h5 tvlstr2.h5 flush.h5 \ enum1.h5 titerate.h5 ttsafe.h5 tarray1.h5 tgenprop.h5 \ @@ -95,6 +95,6 @@ MOSTLYCLEANFILES=cmpd_dset.h5 compact_dataset.h5 dataset.h5 extend.h5 istore.h5\ # Sources for testhdf5 executable testhdf5_SOURCES=testhdf5.c tarray.c tattr.c tconfig.c tfile.c tgenprop.c \ th5s.c theap.c tid.c titerate.c tmeta.c tmisc.c ttime.c trefer.c trefstr.c \ - tselect.c tskiplist.c ttst.c tvltypes.c tvlstr.c + tselect.c tskiplist.c ttst.c tunicode.c tvltypes.c tvlstr.c include $(top_srcdir)/config/conclude.am diff --git a/test/Makefile.in b/test/Makefile.in index d360fef..a996b94 100644 --- a/test/Makefile.in +++ b/test/Makefile.in @@ -236,7 +236,8 @@ am_testhdf5_OBJECTS = testhdf5.$(OBJEXT) tarray.$(OBJEXT) \ tid.$(OBJEXT) titerate.$(OBJEXT) tmeta.$(OBJEXT) \ tmisc.$(OBJEXT) ttime.$(OBJEXT) trefer.$(OBJEXT) \ trefstr.$(OBJEXT) tselect.$(OBJEXT) tskiplist.$(OBJEXT) \ - ttst.$(OBJEXT) tvltypes.$(OBJEXT) tvlstr.$(OBJEXT) + ttst.$(OBJEXT) tunicode.$(OBJEXT) tvltypes.$(OBJEXT) \ + tvlstr.$(OBJEXT) testhdf5_OBJECTS = $(am_testhdf5_OBJECTS) testhdf5_LDADD = $(LDADD) 
testhdf5_DEPENDENCIES = libh5test.la $(am__DEPENDENCIES_1) @@ -534,7 +535,7 @@ MOSTLYCLEANFILES = cmpd_dset.h5 compact_dataset.h5 dataset.h5 extend.h5 istore.h extern_[1-3].h5 extern_[1-4][ab].raw gheap[0-4].h5 dt_atomic[1-2]\ links.h5 links[1-3].h5 big.data big[0-9][0-9][0-9][0-9][0-9].h5 \ dtypes[1-8].h5 dt_atomic[1-2].h5 tattr.h5 tselect.h5 mtime.h5 \ - unlink.h5 \ + unlink.h5 unicode.h5 \ fillval_[0-9].h5 fillval.raw mount_[0-9].h5 testmeta.h5 ttime.h5 \ trefer[1-3].h5 tvltypes.h5 tvlstr.h5 tvlstr2.h5 flush.h5 \ enum1.h5 titerate.h5 ttsafe.h5 tarray1.h5 tgenprop.h5 \ @@ -549,7 +550,7 @@ MOSTLYCLEANFILES = cmpd_dset.h5 compact_dataset.h5 dataset.h5 extend.h5 istore.h # Sources for testhdf5 executable testhdf5_SOURCES = testhdf5.c tarray.c tattr.c tconfig.c tfile.c tgenprop.c \ th5s.c theap.c tid.c titerate.c tmeta.c tmisc.c ttime.c trefer.c trefstr.c \ - tselect.c tskiplist.c ttst.c tvltypes.c tvlstr.c + tselect.c tskiplist.c ttst.c tunicode.c tvltypes.c tvlstr.c # Automake needs to be taught how to build lib, progs, and tests targets. 
@@ -808,6 +809,7 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ttsafe_dcreate.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ttsafe_error.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ttst.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tunicode.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tvlstr.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tvltypes.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/unlink.Po@am__quote@ diff --git a/test/testhdf5.c b/test/testhdf5.c index 05149aa..0164b88 100644 --- a/test/testhdf5.c +++ b/test/testhdf5.c @@ -60,6 +60,7 @@ main(int argc, char *argv[]) AddTest("iterate", test_iterate, cleanup_iterate, "Group & Attribute Iteration", NULL); AddTest("array", test_array, cleanup_array, "Array Datatypes", NULL); AddTest("genprop", test_genprop, cleanup_genprop, "Generic Properties", NULL); + AddTest("unicode", test_unicode, cleanup_unicode, "UTF-8 Encoding", NULL); AddTest("misc", test_misc, cleanup_misc, "Miscellaneous", NULL); /* Display testing information */ diff --git a/test/testhdf5.h b/test/testhdf5.h index c32782a..7043950 100644 --- a/test/testhdf5.h +++ b/test/testhdf5.h @@ -130,6 +130,7 @@ void test_configure(void); void test_misc(void); void test_ids(void); void test_skiplist(void); +void test_unicode(void); /* Prototypes for the cleanup routines */ void cleanup_metadata(void); @@ -146,6 +147,7 @@ void cleanup_array(void); void cleanup_genprop(void); void cleanup_configure(void); void cleanup_misc(void); +void cleanup_unicode(void); #ifdef __cplusplus } diff --git a/test/tunicode.c b/test/tunicode.c new file mode 100644 index 0000000..2a4b73e --- /dev/null +++ b/test/tunicode.c @@ -0,0 +1,837 @@ +/* Unicode test */ +#include +#include +#include +#include +#include "testhdf5.h" + +#define NUM_CHARS 16 +#define MAX_STRING_LENGTH ((NUM_CHARS * 4) + 1) /* Max length in bytes */ +#define MAX_PATH_LENGTH 
(MAX_STRING_LENGTH + 20) /* Max length in bytes */ +#define MAX_CODE_POINT 0x200000 +#define FILENAME "unicode.h5" +/* A buffer to hold two copies of the UTF-8 string */ +#define LONG_BUF_SIZE (2 * MAX_STRING_LENGTH + 4) + +#define DSET1_NAME "fl_string_dataset" +#define DSET2_NAME "dataset2" +#define DSET3_NAME "dataset3" +#define DSET4_NAME "dataset4" +#define VL_DSET1_NAME "vl_dset_1" +#define VL_DSET2_NAME "vl_dset_2" +#define GROUP1_NAME "group1" +#define GROUP2_NAME "group2" +#define GROUP3_NAME "group3" +#define GROUP4_NAME "group4" +#define SLINK_NAME "soft_link" + +#define RANK 1 +#define COMP_INT_VAL 7 +#define COMP_FLOAT_VAL -42.0 +#define COMP_DOUBLE_VAL 42.0 + +/* Test function prototypes */ +void test_fl_string(hid_t fid, const char *string); +void test_strpad(hid_t fid, const char *string); +void test_vl_string(hid_t fid, const char *string); +void test_objnames(hid_t fid, const char *string); +void test_attrname(hid_t fid, const char *string); +void test_compound(hid_t fid, const char *string); +void test_enum(hid_t fid, const char *string); +void test_opaque(hid_t fid, const char *string); + +/* Utility function prototypes */ +static hid_t mkstr(size_t len, H5T_str_t strpad); +unsigned int write_char(unsigned int c, char * test_string, unsigned int cur_pos); +void dump_string(const char * string); + +/* + * test_fl_string + * Tests that UTF-8 can be used for fixed-length string data. + * Writes the string to a dataset and reads it back again. 
+ */ +void test_fl_string(hid_t fid, const char *string) +{ + hid_t dtype_id, space_id, dset_id; + hsize_t dims=1; + char read_buf[MAX_STRING_LENGTH]; + H5T_cset_t cset; + herr_t ret; + + /* Create the datatype, ensure that the character set behaves + * correctly (it should default to ASCII and can be set to UTF8) + */ + dtype_id = H5Tcopy(H5T_C_S1); + CHECK(dtype_id, FAIL, "H5Tcopy"); + ret=H5Tset_size(dtype_id, MAX_STRING_LENGTH); + CHECK(ret, FAIL, "H5Tset_size"); + cset=H5Tget_cset(dtype_id); + VERIFY(cset, H5T_CSET_ASCII, "H5Tget_cset"); + ret=H5Tset_cset(dtype_id, H5T_CSET_UTF8); + CHECK(ret, FAIL, "H5Tset_cset"); + cset=H5Tget_cset(dtype_id); + VERIFY(cset, H5T_CSET_UTF8, "H5Tget_cset"); + + /* Create dataspace for a dataset */ + space_id = H5Screate_simple(RANK, &dims, NULL); + CHECK(space_id, FAIL, "H5Screate_simple"); + /* Create a dataset */ + dset_id=H5Dcreate(fid, DSET1_NAME, dtype_id, space_id, H5P_DEFAULT); + CHECK(dset_id, FAIL, "H5Dcreate"); + + /* Write UTF-8 string to dataset */ + ret = H5Dwrite(dset_id, dtype_id, H5S_ALL, H5S_ALL, H5P_DEFAULT, string); + CHECK(ret, FAIL, "H5Dwrite"); + + /* Read string back and make sure it is unchanged */ + ret = H5Dread(dset_id, dtype_id, H5S_ALL, H5S_ALL, H5P_DEFAULT, read_buf); + CHECK(ret, FAIL, "H5Dread"); + + VERIFY(strcmp(string, read_buf), 0, "strcmp"); + + /* Close all */ + ret=H5Dclose(dset_id); + CHECK(ret, FAIL, "H5Dclose"); + + ret=H5Tclose(dtype_id); + CHECK(ret, FAIL, "H5Tclose"); + ret=H5Sclose(space_id); + CHECK(ret, FAIL, "H5Sclose"); +} + +/* + * test_strpad + * Tests string padding for a UTF-8 string. + * Converts strings to shorter and then longer strings. + * Borrows heavily from dtypes.c, but is more complicated because + * the string is randomly generated. + */ +void test_strpad(hid_t UNUSED fid, const char *string) +{ + /* buf is used to hold the data that H5Tconvert operates on. 
*/ + char buf[LONG_BUF_SIZE]; + + /* cmpbuf holds the output that H5Tconvert should produce, + * to compare against the actual output. */ + char cmpbuf[LONG_BUF_SIZE]; + + /* new_string is a slightly modified version of the UTF-8 + * string to make the tests run more smoothly. */ + char new_string[MAX_STRING_LENGTH + 2]; + + unsigned int length; /* Length of new_string in bytes */ + unsigned int small_len; /* Size of the small datatype */ + unsigned int big_len; /* Size of the larger datatype */ + hid_t src_type, dst_type; + herr_t ret; + + /* The following tests are simpler if the UTF-8 string contains + * the right number of bytes (even or odd, depending on the test). + * We create a 'new_string' whose length is convenient by prepending + * an 'x' to 'string' when necessary. */ + length = strlen(string); + if(length % 2 != 1) + { + strcpy(new_string, "x"); + strcat(new_string, string); + length++; + } else { + strcpy(new_string, string); + } + + + /* Convert a null-terminated string to a shorter and longer null + * terminated string. */ + + /* Create a src_type that holds the UTF-8 string and its final NULL */ + big_len = length + 1; /* +1 byte for final NULL */ + src_type = mkstr(big_len, H5T_STR_NULLTERM); + CHECK(src_type, FAIL, "mkstr"); + /* Create a dst_type that holds half of the UTF-8 string and a final + * NULL */ + small_len = (length +1) / 2; + dst_type = mkstr(small_len, H5T_STR_NULLTERM); + CHECK(dst_type, FAIL, "mkstr"); + + /* Fill the buffer with two copies of the UTF-8 string, each with a + * terminating NULL. It will look like "abcdefg\0abcdefg\0". */ + strncpy(buf, new_string, big_len); + strncpy(&buf[big_len], new_string, big_len); + + ret = H5Tconvert(src_type, dst_type, 2, buf, NULL, H5P_DEFAULT); + CHECK(ret, FAIL, "H5Tconvert"); + + /* After conversion, the buffer should look like + * "abc\0abc\0abcdefg\0". Note that this is just what the bytes look + * like; UTF-8 characters may well have been truncated. 
+ * To check that the conversion worked properly, we'll build this + * string manually. */ + strncpy(cmpbuf, new_string, small_len -1); + cmpbuf[small_len - 1] = '\0'; + strncpy(&cmpbuf[small_len], new_string, small_len -1); + cmpbuf[2 * small_len - 1] = '\0'; + strcpy(&cmpbuf[2 * small_len], new_string); + + VERIFY(HDmemcmp(buf, cmpbuf, 2*big_len), 0, "HDmemcmp"); + + /* Now convert from smaller datatype to bigger datatype. This should + * leave our buffer looking like: "abc\0\0\0\0\0abc\0\0\0\0\0" */ + ret = H5Tconvert(dst_type, src_type, 2, buf, NULL, H5P_DEFAULT); + CHECK(ret, FAIL, "H5Tconvert"); + + /* First fill the buffer with NULLs */ + HDmemset(cmpbuf, '\0', LONG_BUF_SIZE); + /* Copy in the characters */ + strncpy(cmpbuf, new_string, small_len -1); + strncpy(&cmpbuf[big_len], new_string, small_len -1); + + VERIFY(HDmemcmp(buf, cmpbuf, 2*big_len), 0, "HDmemcmp"); + + ret = H5Tclose(src_type); + CHECK(ret, FAIL, "H5Tclose"); + ret = H5Tclose(dst_type); + CHECK(ret, FAIL, "H5Tclose"); + + + /* Now test null padding. Null-padded strings do *not* need + * terminating NULLs, so the sizes of the datatypes are slightly + * different and we want a string with an even number of characters. */ + length = strlen(string); + if(length % 2 != 0) + { + strcpy(new_string, "x"); + strcat(new_string, string); + length++; + } else { + strcpy(new_string, string); + } + + /* Create a src_type that holds the UTF-8 string */ + big_len = length; + src_type = mkstr(big_len, H5T_STR_NULLPAD); + CHECK(src_type, FAIL, "mkstr"); + /* Create a dst_type that holds half of the UTF-8 string */ + small_len = length / 2; + dst_type = mkstr(small_len, H5T_STR_NULLPAD); + CHECK(dst_type, FAIL, "mkstr"); + + /* Fill the buffer with two copies of the UTF-8 string. + * It will look like "abcdefghabcdefgh". 
*/ + strncpy(buf, new_string, big_len); + strncpy(&buf[big_len], new_string, big_len); + + ret = H5Tconvert(src_type, dst_type, 2, buf, NULL, H5P_DEFAULT); + CHECK(ret, FAIL, "H5Tconvert"); + + /* After conversion, the buffer should look like + * "abcdabcdabcdefgh". Note that this is just what the bytes look + * like; UTF-8 characters may well have been truncated. + * To check that the conversion worked properly, we'll build this + * string manually. */ + strncpy(cmpbuf, new_string, small_len); + strncpy(&cmpbuf[small_len], new_string, small_len); + strncpy(&cmpbuf[2 * small_len], new_string, big_len); + + VERIFY(HDmemcmp(buf, cmpbuf, 2*big_len), 0, "HDmemcmp"); + + /* Now convert from smaller datatype to bigger datatype. This should + * leave our buffer looking like: "abcd\0\0\0\0abcd\0\0\0\0" */ + ret = H5Tconvert(dst_type, src_type, 2, buf, NULL, H5P_DEFAULT); + CHECK(ret, FAIL, "H5Tconvert"); + + /* First fill the buffer with NULLs */ + HDmemset(cmpbuf, '\0', LONG_BUF_SIZE); + /* Copy in the characters */ + strncpy(cmpbuf, new_string, small_len); + strncpy(&cmpbuf[big_len], new_string, small_len); + + VERIFY(HDmemcmp(buf, cmpbuf, 2*big_len), 0, "HDmemcmp"); + + ret = H5Tclose(src_type); + CHECK(ret, FAIL, "H5Tclose"); + ret = H5Tclose(dst_type); + CHECK(ret, FAIL, "H5Tclose"); + + + /* Test space padding. This is very similar to null-padding; we can + use the same values of length, small_len, and big_len. */ + + src_type = mkstr(big_len, H5T_STR_SPACEPAD); + CHECK(src_type, FAIL, "mkstr"); + dst_type = mkstr(small_len, H5T_STR_SPACEPAD); + CHECK(dst_type, FAIL, "mkstr"); + + /* Fill the buffer with two copies of the UTF-8 string. + * It will look like "abcdefghabcdefgh". */ + strcpy(buf, new_string); + strcpy(&buf[big_len], new_string); + + ret = H5Tconvert(src_type, dst_type, 2, buf, NULL, H5P_DEFAULT); + CHECK(ret, FAIL, "H5Tconvert"); + + /* After conversion, the buffer should look like + * "abcdabcdabcdefgh". 
Note that this is just what the bytes look + * like; UTF-8 characters may have been truncated. + * To check that the conversion worked properly, we'll build this + * string manually. */ + strncpy(cmpbuf, new_string, small_len); + strncpy(&cmpbuf[small_len], new_string, small_len); + strncpy(&cmpbuf[2 * small_len], new_string, big_len); + + VERIFY(HDmemcmp(buf, cmpbuf, 2*big_len), 0, "HDmemcmp"); + + /* Now convert from smaller datatype to bigger datatype. This should + * leave our buffer looking like: "abcd abcd " */ + ret = H5Tconvert(dst_type, src_type, 2, buf, NULL, H5P_DEFAULT); + CHECK(ret, FAIL, "H5Tconvert"); + + /* First fill the buffer with spaces */ + HDmemset(cmpbuf, ' ', LONG_BUF_SIZE); + /* Copy in the characters */ + strncpy(cmpbuf, new_string, small_len); + strncpy(&cmpbuf[big_len], new_string, small_len); + + VERIFY(HDmemcmp(buf, cmpbuf, 2*big_len), 0, "HDmemcmp"); + + ret = H5Tclose(src_type); + CHECK(ret, FAIL, "H5Tclose"); + ret = H5Tclose(dst_type); + CHECK(ret, FAIL, "H5Tclose"); +} + + +/* + * test_vl_string + * Tests variable-length string datatype with UTF-8 strings. 
+ */ +void test_vl_string(hid_t fid, const char *string) +{ + hid_t type_id, space_id, dset_id; + hsize_t dims=1; + hsize_t size; /* Number of bytes used */ + char *read_buf[1]; + herr_t ret; + + /* Create dataspace for datasets */ + space_id = H5Screate_simple(RANK, &dims, NULL); + CHECK(space_id, FAIL, "H5Screate_simple"); + + /* Create a datatype to refer to */ + type_id = H5Tcopy(H5T_C_S1); + CHECK(type_id, FAIL, "H5Tcopy"); + ret = H5Tset_size(type_id, H5T_VARIABLE); + CHECK(ret, FAIL, "H5Tset_size"); + + /* Create a dataset */ + dset_id=H5Dcreate(fid, VL_DSET1_NAME, type_id, space_id, H5P_DEFAULT); + CHECK(dset_id, FAIL, "H5Dcreate"); + + /* Write dataset to disk */ + ret=H5Dwrite(dset_id, type_id, H5S_ALL, H5S_ALL, H5P_DEFAULT, &string); + CHECK(ret, FAIL, "H5Dwrite"); + + /* Make certain the correct amount of memory will be used */ + ret=H5Dvlen_get_buf_size(dset_id, type_id, space_id, &size); + CHECK(ret, FAIL, "H5Dvlen_get_buf_size"); + VERIFY(size, (hsize_t)strlen(string) + 1, "H5Dvlen_get_buf_size"); + + /* Read dataset from disk */ + ret=H5Dread(dset_id, type_id, H5S_ALL, H5S_ALL, H5P_DEFAULT, read_buf); + CHECK(ret, FAIL, "H5Dread"); + + /* Compare data read in */ + VERIFY(strcmp(string, read_buf[0]), 0, "strcmp"); + + /* Reclaim the read VL data */ + ret=H5Dvlen_reclaim(type_id, space_id, H5P_DEFAULT, read_buf); + CHECK(ret, FAIL, "H5Dvlen_reclaim"); + + /* Close all */ + ret = H5Dclose(dset_id); + CHECK(ret, FAIL, "H5Dclose"); + ret = H5Tclose(type_id); + CHECK(ret, FAIL, "H5Tclose"); + ret = H5Sclose(space_id); + CHECK(ret, FAIL, "H5Sclose"); +} + +/* + * test_objnames + * Tests that UTF-8 can be used for object names in the file. + * Tests groups, datasets, named datatypes, and soft links. 
+ */ +void test_objnames(hid_t fid, const char* string) +{ + hid_t grp_id, grp1_id, grp2_id, grp3_id; + hid_t type_id, dset_id, space_id; + char read_buf[MAX_STRING_LENGTH]; + char path_buf[MAX_PATH_LENGTH]; + hsize_t dims=1; + hobj_ref_t obj_ref; + herr_t ret; + + /* Create a group with a UTF-8 name */ + grp_id = H5Gcreate(fid, string, 0); + CHECK(grp_id, FAIL, "H5Gcreate"); + + /* Set a comment on the group to test that we can access the group + * Also test that UTF-8 comments can be read. + */ + ret = H5Gset_comment(fid, string, string); + CHECK(ret, FAIL, "H5Gset_comment"); + ret = H5Gget_comment(fid, string, MAX_STRING_LENGTH, read_buf); + CHECK(ret, FAIL, "H5Gget_comment"); + + ret = H5Gclose(grp_id); + CHECK(ret, FAIL, "H5Gclose"); + + VERIFY(strcmp(string, read_buf), 0, "strcmp"); + + /* Create a new dataset with a UTF-8 name */ + grp1_id = H5Gcreate(fid, GROUP1_NAME, 0); + CHECK(grp1_id, FAIL, "H5Gcreate"); + + space_id = H5Screate_simple(RANK, &dims, NULL); + CHECK(space_id, FAIL, "H5Screate_simple"); + dset_id=H5Dcreate(grp1_id, string, H5T_NATIVE_INT, space_id, H5P_DEFAULT); + CHECK(dset_id, FAIL, "H5Dcreate"); + + /* Make sure that dataset can be opened again */ + ret=H5Dclose(dset_id); + CHECK(ret, FAIL, "H5Dclose"); + ret=H5Sclose(space_id); + CHECK(ret, FAIL, "H5Sclose"); + + dset_id=H5Dopen(grp1_id, string); + CHECK(dset_id, FAIL, "H5Dopen"); + ret=H5Dclose(dset_id); + CHECK(ret, FAIL, "H5Dclose"); + ret = H5Gclose(grp1_id); + CHECK(ret, FAIL, "H5Gclose"); + + /* Do the same for a named datatype */ + grp2_id = H5Gcreate(fid, GROUP2_NAME, 0); + CHECK(grp2_id, FAIL, "H5Gcreate"); + + type_id = H5Tcreate(H5T_OPAQUE, 1); + CHECK(type_id, FAIL, "H5Tcreate"); + ret = H5Tcommit(grp2_id, string, type_id); + CHECK(ret, FAIL, "H5Tcommit"); + ret = H5Tclose(type_id); + CHECK(ret, FAIL, "H5Tclose"); + + type_id = H5Topen(grp2_id, string); + CHECK(type_id, FAIL, "H5Topen"); + ret = H5Tclose(type_id); + CHECK(ret, FAIL, "H5Tclose"); + + /* Don't close 
the group -- use it to test that object references + * can refer to objects named in UTF-8 */ + + space_id = H5Screate_simple(RANK, &dims, NULL); + CHECK(space_id, FAIL, "H5Screate_simple"); + dset_id=H5Dcreate(grp2_id, DSET3_NAME, H5T_STD_REF_OBJ, space_id, H5P_DEFAULT); + CHECK(dset_id, FAIL, "H5Dcreate"); + + /* Create reference to named datatype */ + ret = H5Rcreate(&obj_ref, grp2_id, string, H5R_OBJECT, -1); + CHECK(ret, FAIL, "H5Rcreate"); + /* Write reference and read it back */ + ret = H5Dwrite(dset_id, H5T_STD_REF_OBJ, H5S_ALL, H5S_ALL, H5P_DEFAULT, &obj_ref); + CHECK(ret, FAIL, "H5Dwrite"); + ret = H5Dread(dset_id, H5T_STD_REF_OBJ, H5S_ALL, H5S_ALL, H5P_DEFAULT, &obj_ref); + CHECK(ret, FAIL, "H5Dread"); + + /* Ensure that we can open named datatype using object reference */ + type_id = H5Rdereference(dset_id, H5R_OBJECT, &obj_ref); + CHECK(type_id, FAIL, "H5Rdereference"); + ret = H5Tcommitted(type_id); + VERIFY(ret, 1, "H5Tcommitted"); + + ret = H5Tclose(type_id); + CHECK(ret, FAIL, "H5Tclose"); + ret = H5Dclose(dset_id); + CHECK(ret, FAIL, "H5Dclose"); + ret = H5Sclose(space_id); + CHECK(ret, FAIL, "H5Sclose"); + ret = H5Gclose(grp2_id); + CHECK(ret, FAIL, "H5Gclose"); + + /* Create "group3". Build a hard link from group3 to group2, which has + * a datatype with the UTF-8 name. Create a soft link in group3 + * pointing through the hard link to the datatype. Give the soft + * link a name in UTF-8. Ensure that the soft link works. 
*/ + + grp3_id = H5Gcreate(fid, GROUP3_NAME, 0); + CHECK(grp3_id, FAIL, "H5Gcreate"); + + ret = H5Glink2(fid, GROUP2_NAME, H5G_LINK_HARD, grp3_id, GROUP2_NAME); + CHECK(ret, FAIL, "H5Glink2"); + strcpy(path_buf, GROUP2_NAME); + strcat(path_buf, "/"); + strcat(path_buf, string); + ret = H5Glink(grp3_id, H5G_LINK_SOFT, path_buf, string); + CHECK(ret, FAIL, "H5Glink"); + + /* Open named datatype using soft link */ + type_id = H5Topen(grp3_id, string); + CHECK(type_id, FAIL, "H5Topen"); + + ret = H5Tclose(type_id); + CHECK(ret, FAIL, "H5Tclose"); + ret = H5Gclose(grp3_id); + CHECK(ret, FAIL, "H5Gclose"); +} + +/* + * test_attrname + * Test that attributes can deal with UTF-8 strings + */ +void test_attrname(hid_t fid, const char * string) +{ + hid_t group_id, attr_id; + hid_t dtype_id, space_id; + hsize_t dims=1; + char read_buf[MAX_STRING_LENGTH]; + herr_t ret; + + /* Create a new group and give it an attribute whose + * name and value are UTF-8 strings. + */ + group_id = H5Gcreate(fid, GROUP4_NAME, 0); + CHECK(group_id, FAIL, "H5Gcreate"); + + space_id = H5Screate_simple(RANK, &dims, NULL); + CHECK(space_id, FAIL, "H5Screate_simple"); + dtype_id = H5Tcopy(H5T_C_S1); + CHECK(dtype_id, FAIL, "H5Tcopy"); + ret=H5Tset_size(dtype_id, MAX_STRING_LENGTH); + CHECK(ret, FAIL, "H5Tset_size"); + + /* Create the attribute and check that its name is correct */ + attr_id = H5Acreate(group_id, string, dtype_id, space_id, H5P_DEFAULT); + CHECK(attr_id, FAIL, "H5Acreate"); + ret = H5Aget_name(attr_id, MAX_STRING_LENGTH, read_buf); + CHECK(ret, FAIL, "H5Aget_name"); + ret = strcmp(read_buf, string); + VERIFY(ret, 0, "strcmp"); + read_buf[0] = '\0'; + + /* Try writing and reading from the attribute */ + ret = H5Awrite(attr_id, dtype_id, string); + CHECK(ret, FAIL, "H5Awrite"); + ret = H5Aread(attr_id, dtype_id, read_buf); + CHECK(ret, FAIL, "H5Aread"); + ret = strcmp(read_buf, string); + VERIFY(ret, 0, "strcmp"); + + /* Clean up */ + ret = H5Aclose(attr_id); + CHECK(ret, FAIL, 
"H5Aclose"); + ret = H5Tclose(dtype_id); + CHECK(ret, FAIL, "H5Tclose"); + ret = H5Sclose(space_id); + CHECK(ret, FAIL, "H5Sclose"); + ret = H5Gclose(group_id); + CHECK(ret, FAIL, "H5Gclose"); +} + +/* + * test_compound + * Test that compound datatypes can have UTF-8 field names. + */ +void test_compound(hid_t fid, const char * string) +{ + /* Define two compound structures, s1_t and s2_t. + * s2_t is a subset of s1_t, with two out of three + * fields. + * This is stolen from the h5_compound example. + */ + typedef struct s1_t { + int a; + double c; + float b; + } s1_t; + typedef struct s2_t { + double c; + int a; + } s2_t; + /* Actual variable declarations */ + s1_t s1; + s2_t s2; + hid_t s1_tid, s2_tid; + hid_t space_id, dset_id; + hsize_t dim = 1; + char *readbuf; + herr_t ret; + + /* Initialize compound data */ + s1.a = COMP_INT_VAL; + s1.c = COMP_DOUBLE_VAL; + s1.b = COMP_FLOAT_VAL; + + /* Create compound datatypes using UTF-8 field name */ + s1_tid = H5Tcreate (H5T_COMPOUND, sizeof(s1_t)); + CHECK(s1_tid, FAIL, "H5Tcreate"); + ret = H5Tinsert(s1_tid, string, HOFFSET(s1_t, a), H5T_NATIVE_INT); + CHECK(ret, FAIL, "H5Tinsert"); + + /* Check that the field name was stored correctly */ + readbuf=H5Tget_member_name(s1_tid, 0); + ret = strcmp(readbuf, string); + VERIFY(ret, 0, "strcmp"); + free(readbuf); + + /* Add the other fields to the datatype */ + ret = H5Tinsert(s1_tid, "c_name", HOFFSET(s1_t, c), H5T_NATIVE_DOUBLE); + CHECK(ret, FAIL, "H5Tinsert"); + ret = H5Tinsert(s1_tid, "b_name", HOFFSET(s1_t, b), H5T_NATIVE_FLOAT); + CHECK(ret, FAIL, "H5Tinsert"); + + /* Create second datatype, with only two fields. */ + s2_tid = H5Tcreate (H5T_COMPOUND, sizeof(s2_t)); + CHECK(s2_tid, FAIL, "H5Tcreate"); + ret = H5Tinsert(s2_tid, "c_name", HOFFSET(s2_t, c), H5T_NATIVE_DOUBLE); + CHECK(ret, FAIL, "H5Tinsert"); + ret = H5Tinsert(s2_tid, string, HOFFSET(s2_t, a), H5T_NATIVE_INT); + CHECK(ret, FAIL, "H5Tinsert"); + + /* Create the dataspace and dataset. 
*/ + space_id = H5Screate_simple(1, &dim, NULL); + CHECK(space_id, FAIL, "H5Screate_simple"); + dset_id = H5Dcreate(fid, DSET4_NAME, s1_tid, space_id, H5P_DEFAULT); + CHECK(dset_id, FAIL, "H5Dcreate"); + + /* Write data to the dataset. */ + ret = H5Dwrite(dset_id, s1_tid, H5S_ALL, H5S_ALL, H5P_DEFAULT, &s1); + CHECK(ret, FAIL, "H5Dwrite"); + + /* Ensure that data can be read back by field name into s2 struct */ + ret = H5Dread(dset_id, s2_tid, H5S_ALL, H5S_ALL, H5P_DEFAULT, &s2); + CHECK(ret, FAIL, "H5Dread"); + + VERIFY(s2.a, COMP_INT_VAL, "H5Dread"); + VERIFY(s2.c, COMP_DOUBLE_VAL, "H5Dread"); + + /* Clean up */ + ret = H5Tclose(s1_tid); + CHECK(ret, FAIL, "H5Tclose"); + ret = H5Tclose(s2_tid); + CHECK(ret, FAIL, "H5Tclose"); + ret = H5Sclose(space_id); + CHECK(ret, FAIL, "H5Sclose"); + ret = H5Dclose(dset_id); + CHECK(ret, FAIL, "H5Dclose"); +} + +/* + * test_enum + * Test that enumerated datatypes can have UTF-8 member names. + */ +void test_enum(hid_t UNUSED fid, const char * string) +{ + /* Define an enumerated type */ + typedef enum { + E1_RED, + E1_GREEN, + E1_BLUE, + E1_WHITE + } c_e1; + /* Variable declarations */ + c_e1 val; + herr_t ret; + hid_t type_id; + char readbuf[MAX_STRING_LENGTH]; + + /* Create an enumerated datatype in HDF5 with a UTF-8 member name*/ + type_id = H5Tcreate(H5T_ENUM, sizeof(c_e1)); + CHECK(type_id, FAIL, "H5Tcreate"); + val = E1_RED; + ret = H5Tenum_insert(type_id, "RED", &val); + CHECK(ret, FAIL, "H5Tenum_insert"); + val = E1_GREEN; + ret = H5Tenum_insert(type_id, "GREEN", &val); + CHECK(ret, FAIL, "H5Tenum_insert"); + val = E1_BLUE; + ret = H5Tenum_insert(type_id, "BLUE", &val); + CHECK(ret, FAIL, "H5Tenum_insert"); + val = E1_WHITE; + ret = H5Tenum_insert(type_id, string, &val); + CHECK(ret, FAIL, "H5Tenum_insert"); + + /* Ensure that UTF-8 member name gives the right value and vice versa. 
*/ + ret = H5Tenum_valueof(type_id, string, &val); + CHECK(ret, FAIL, "H5Tenum_valueof"); + VERIFY(val, E1_WHITE, "H5Tenum_valueof"); + ret = H5Tenum_nameof(type_id, &val, readbuf, MAX_STRING_LENGTH); + CHECK(ret, FAIL, "H5Tenum_nameof"); + ret = strcmp(readbuf, string); + VERIFY(ret, 0, "strcmp"); + + /* Close the datatype */ + ret = H5Tclose(type_id); + CHECK(ret, FAIL, "H5Tclose"); +} + +/* + * test_opaque + * Test UTF-8 tags on opaque datatypes + */ +void test_opaque(hid_t UNUSED fid, const char * string) +{ + hid_t type_id; + char * read_buf; + herr_t ret; + + /* Create an opaque type and give it a UTF-8 tag */ + type_id = H5Tcreate(H5T_OPAQUE, 4); + CHECK(type_id, FAIL, "H5Tcreate"); + ret = H5Tset_tag(type_id, string); + CHECK(ret, FAIL, "H5Tset_tag"); + + /* Read the tag back. */ + read_buf = H5Tget_tag(type_id); + ret = strcmp(read_buf, string); + VERIFY(ret, 0, "H5Tget_tag"); + free(read_buf); + + ret = H5Tclose(type_id); + CHECK(ret, FAIL, "H5Tclose"); +} + +/*********************/ +/* Utility functions */ +/*********************/ + +/* mkstr + * Borrowed from dtypes.c. + * Creates a new string data type of size len with padding strpad. Used in string padding tests. Returns the datatype id, or -1 on failure (the partially built datatype is closed first so it is not leaked). */ +static hid_t mkstr(size_t len, H5T_str_t strpad) +{ + hid_t t; + if ((t=H5Tcopy(H5T_C_S1))<0) return -1; + if (H5Tset_size(t, len)<0) { H5Tclose(t); return -1; } + if (H5Tset_strpad(t, strpad)<0) { H5Tclose(t); return -1; } + return t; +} + +/* write_char + * Append a unicode code point c to test_string in UTF-8 encoding. + * Return the new end of the string. 
+ */ +unsigned int write_char(unsigned int c, char * test_string, unsigned int cur_pos) +{ + if (c < 0x80) { + test_string[cur_pos] = c; + cur_pos++; + } + else if (c < 0x800) { + test_string[cur_pos] = (0xC0 | c>>6); + test_string[cur_pos+1] = (0x80 | (c & 0x3F)); + cur_pos += 2; + } + else if (c < 0x10000) { + test_string[cur_pos] = (0xE0 | c>>12); + test_string[cur_pos+1] = (0x80 | (c>>6 & 0x3F)); + test_string[cur_pos+2] = (0x80 | (c & 0x3F)); + cur_pos += 3; + } + else if (c < 0x200000) { + test_string[cur_pos] = (0xF0 | c>>18); + test_string[cur_pos+1] = (0x80 | (c>>12 & 0x3F)); + test_string[cur_pos+2] = (0x80 | (c>>6 & 0x3F)); + test_string[cur_pos+3] = (0x80 | (c & 0x3F)); + cur_pos += 4; + } + + return cur_pos; +} + +/* dump_string + * Print a string both as text (which will look like garbage) and as hex. + * The text display is not guaranteed to be accurate--certain characters + * could confuse printf (e.g., '\n'). */ +void dump_string(const char * string) +{ + unsigned int length; + unsigned int x; + + printf("The string was:\n"); + printf(string); + printf("Or in hex:\n"); + + length = strlen(string); + + for(x=0; x