diff options
Diffstat (limited to 'src/H5FDs3comms.h')
-rw-r--r-- | src/H5FDs3comms.h | 634 |
1 files changed, 634 insertions, 0 deletions
diff --git a/src/H5FDs3comms.h b/src/H5FDs3comms.h new file mode 100644 index 0000000..0524c46 --- /dev/null +++ b/src/H5FDs3comms.h @@ -0,0 +1,634 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Read-Only S3 Virtual File Driver (VFD) * + * Copyright (c) 2017-2018, The HDF Group. * + * * + * All rights reserved. * + * * + * NOTICE: * + * All information contained herein is, and remains, the property of The HDF * + * Group. The intellectual and technical concepts contained herein are * + * proprietary to The HDF Group. Dissemination of this information or * + * reproduction of this material is strictly forbidden unless prior written * + * permission is obtained from The HDF Group. * + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +/***************************************************************************** + * + * This is the header for the S3 Communications module + * + * ***NOT A FILE DRIVER*** + * + * Purpose: + * + * - Provide structures and functions related to communicating with + * Amazon S3 (Simple Storage Service). + * - Abstract away the REST API (HTTP, + * networked communications) behind a series of uniform function calls. + * - Handle AWS4 authentication, if appropriate. + * - Fail predictably in event of errors. + * - Eventually, support more S3 operations, such as creating, writing to, + * and removing Objects remotely. + * + * translates: + * `read(some_file, bytes_offset, bytes_length, &dest_buffer);` + * to: + * ``` + * GET myfile HTTP/1.1 + * Host: somewhere.me + * Range: bytes=4096-5115 + * ``` + * and places received bytes from HTTP response... + * ``` + * HTTP/1.1 206 Partial-Content + * Content-Range: 4096-5115/63239 + * + * <bytes> + * ``` + * ...in destination buffer. + * + * TODO: put documentation in a consistent place and point to it from here. + * + * Programmer: Jacob Smith + * 2017-11-30 + * + *****************************************************************************/ + +#include <ctype.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> + +#ifdef H5_HAVE_ROS3_VFD +#include <curl/curl.h> +#include <openssl/evp.h> +#include <openssl/hmac.h> +#include <openssl/sha.h> +#endif /* ifdef H5_HAVE_ROS3_VFD */ + +/***************** + * PUBLIC MACROS * + *****************/ + +/* hexadecimal string of pre-computed sha256 checksum of the empty string + * hex(sha256sum("")) + */ +#define EMPTY_SHA256 \ +"e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855" + +/* string length (plus null terminator) + * example ISO8601-format string: "20170713T145903Z" (YYYYmmdd'T'HHMMSS'_') + */ +#define ISO8601_SIZE 17 + +/* string length (plus null terminator) + * example RFC7231-format string: "Fri, 30 Jun 2017 20:41:55 GMT" + */ +#define RFC7231_SIZE 30 + +/*--------------------------------------------------------------------------- + * + * Macro: ISO8601NOW() + * + * Purpose: + * + * write "YYYYmmdd'T'HHMMSS'Z'" (less single-quotes) to dest + * e.g., "20170630T204155Z" + * + * wrapper for strftime() + * + * It is left to the programmer to check return value of + * ISO8601NOW (should equal ISO8601_SIZE - 1). + * + * Programmer: Jacob Smith + * 2017-07-?? + * + *--------------------------------------------------------------------------- + */ +#define ISO8601NOW(dest, now_gm) \ +strftime((dest), ISO8601_SIZE, "%Y%m%dT%H%M%SZ", (now_gm)) + +/*--------------------------------------------------------------------------- + * + * Macro: RFC7231NOW() + * + * Purpose: + * + * write "Day, dd Mmm YYYY HH:MM:SS GMT" to dest + * e.g., "Fri, 30 Jun 2017 20:41:55 GMT" + * + * wrapper for strftime() + * + * It is left to the programmer to check return value of + * RFC7231NOW (should equal RFC7231_SIZE - 1). + * + * Programmer: Jacob Smith + * 2017-07-?? + * + *--------------------------------------------------------------------------- + */ +#define RFC7231NOW(dest, now_gm) \ +strftime((dest), RFC7231_SIZE, "%a, %d %b %Y %H:%M:%S GMT", (now_gm)) + + +/* Reasonable maximum length of a credential string. + * Provided for error-checking S3COMMS_FORMAT_CREDENTIAL (below). + * 17 <- "////aws4_request\0" + * 2 < "s3" (service) + * 8 <- "YYYYmmdd" (date) + * 128 <- (access_id) + * 155 :: sum + */ +#define S3COMMS_MAX_CREDENTIAL_SIZE 155 + + +/*--------------------------------------------------------------------------- + * + * Macro: H5FD_S3COMMS_FORMAT_CREDENTIAL() + * + * Purpose: + * + * Format "S3 Credential" string from inputs, for AWS4. + * + * Wrapper for HDsnprintf(). + * + * _HAS NO ERROR-CHECKING FACILITIES_ + * It is left to programmer to ensure that return value confers success. + * e.g., + * ``` + * assert( S3COMMS_MAX_CREDENTIAL_SIZE >= + * S3COMMS_FORMAT_CREDENTIAL(...) ); + * ``` + * + * "<access-id>/<date>/<aws-region>/<aws-service>/aws4_request" + * assuming that `dest` has adequate space. + * + * ALL inputs must be null-terminated strings. + * + * `access` should be the user's access key ID. + * `date` must be of format "YYYYmmdd". + * `region` should be relevant AWS region, i.e. "us-east-1". + * `service` should be "s3". + * + * Programmer: Jacob Smith + * 2017-09-19 + * + * Changes: None. + * + *--------------------------------------------------------------------------- + */ +#define S3COMMS_FORMAT_CREDENTIAL(dest, access, iso8601_date, region, service) \ +HDsnprintf((dest), S3COMMS_MAX_CREDENTIAL_SIZE, \ + "%s/%s/%s/%s/aws4_request", \ + (access), (iso8601_date), (region), (service)) + +/********************* + * PUBLIC STRUCTURES * + *********************/ + + +/*---------------------------------------------------------------------------- + * + * Structure: hrb_node_t + * + * HTTP Header Field Node + * + * + * + * Maintain a ordered (linked) list of HTTP Header fields. + * + * Provides efficient access and manipulation of a logical sequence of + * HTTP header fields, of particular use when composing an + * "S3 Canonical Request" for authentication. + * + * - The creation of a Canoncial Request involves: + * - convert field names to lower case + * - sort by this lower-case name + * - convert ": " name-value separator in HTTP string to ":" + * - get sorted lowercase names without field or separator + * + * As HTTP headers allow headers in any order (excepting the case of multiple + * headers with the same name), the list ordering can be optimized for Canonical + * Request creation, suggesting alphabtical order. For more expedient insertion + * and removal of elements in the list, linked list seems preferable to a + * dynamically-expanding array. The usually-smaller number of entries (5 or + * fewer) makes performance overhead of traversing the list trivial. + * + * The above requirements of creating at Canonical Request suggests a reasonable + * trade-off of speed for space with the option to compute elements as needed + * or to have the various elements prepared and stored in the structure + * (e.g. name, value, lowername, concatenated name:value) + * The structure currently is implemented to pre-compute. + * + * At all times, the "first" node of the list should be the least, + * alphabetically. For all nodes, the `next` node should be either NULL or + * of greater alphabetical value. + * + * Each node contains its own header field information, plus a pointer to the + * next node. + * + * It is not allowed to have multiple nodes with the same _lowercase_ `name`s + * in the same list + * (i.e., name is case-insensitive for access and modification.) + * + * All data (`name`, `value`, `lowername`, and `cat`) are null-terminated + * strings allocated specifically for their node. + * + * + * + * `magic` (unsigned long) + * + * "unique" idenfier number for the structure type + * + * `name` (char *) + * + * Case-meaningful name of the HTTP field. + * Given case is how it is supplied to networking code. + * e.g., "Range" + * + * `lowername` (char *) + * + * Lowercase copy of name. + * e.g., "range" + * + * `value` (char *) + * + * Case-meaningful value of HTTP field. + * e.g., "bytes=0-9" + * + * `cat` (char *) + * + * Concatenated, null-terminated string of HTTP header line, + * as the field would appear in an HTTP request. + * e.g., "Range: bytes=0-9" + * + * `next` (hrb_node_t *) + * + * Pointers to next node in the list, or NULL sentinel as end of list. + * Next node must have a greater `lowername` as determined by strcmp(). + * + * + * + * Programmer: Jacob Smith + * 2017-09-22 + * + * Changes: + * + * - Change from twin doubly-linked lists to singly-linked list. + * --- Jake Smith 2017-01-17 + * + *---------------------------------------------------------------------------- + */ +typedef struct hrb_node_t { + unsigned long magic; + char *name; + char *value; + char *cat; + char *lowername; + struct hrb_node_t *next; +} hrb_node_t; +#define S3COMMS_HRB_NODE_MAGIC 0x7F5757UL + + +/*---------------------------------------------------------------------------- + * + * Structure: hrb_t + * + * HTTP Request Buffer structure + * + * + * + * Logically represent an HTTP request + * + * GET /myplace/myfile.h5 HTTP/1.1 + * Host: over.rainbow.oz + * Date: Fri, 01 Dec 2017 12:35:04 CST + * + * <body> + * + * ...with fast, efficient access to and modification of primary and field + * elements. + * + * Structure for building HTTP requests while hiding much of the string + * processing required "under the hood." + * + * Information about the request target -- the first line -- and the body text, + * if any, are managed directly with this structure. All header fields, e.g., + * "Host" and "Date" above, are created with a linked list of `hrb_node_t` and + * included in the request by a pointer to the head of the list. + * + * + * + * `magic` (unsigned long) + * + * "Magic" number confirming that this is an hrb_t structure and + * what operations are valid for it. + * + * Must be S3COMMS_HRB_MAGIC to be valid. + * + * `body` (char *) : + * + * Pointer to start of HTTP body. + * + * Can be NULL, in which case it is treated as the empty string, "". + * + * `body_len` (size_t) : + * + * Number of bytes (characters) in `body`. 0 if empty or NULL `body`. + * + * `first_header` (hrb_node_t *) : + * + * Pointer to first SORTED header node, if any. + * It is left to the programmer to ensure that this node and associated + * list is destroyed when done. + * + * `resource` (char *) : + * + * Pointer to resource URL string, e.g., "/folder/page.xhtml". + * + * `verb` (char *) : + * + * Pointer to HTTP verb string, e.g., "GET". + * + * `version` (char *) : + * + * Pointer to HTTP version string, e.g., "HTTP/1.1". + * + * + * + * Programmer: Jacob Smith + * + *---------------------------------------------------------------------------- + */ +typedef struct { + unsigned long magic; + char *body; + size_t body_len; + hrb_node_t *first_header; + char *resource; + char *verb; + char *version; +} hrb_t; +#define S3COMMS_HRB_MAGIC 0x6DCC84UL + + +/*---------------------------------------------------------------------------- + * + * Structure: parsed_url_t + * + * + * Represent a URL with easily-accessed pointers to logical elements within. + * These elements (components) are stored as null-terminated strings (or just + * NULLs). These components should be allocated for the structure, making the + * data as safe as possible from modification. If a component is NULL, it is + * either implicit in or absent from the URL. + * + * "http://mybucket.s3.amazonaws.com:8080/somefile.h5?param=value&arg=value" + * ^--^ ^-----------------------^ ^--^ ^---------^ ^-------------------^ + * Scheme Host Port Resource Query/-ies + * + * + * + * `magic` (unsigned long) + * + * Structure identification and validation identifier. + * Identifies as `parsed_url_t` type. + * + * `scheme` (char *) + * + * String representing which protocol is to be expected. + * _Must_ be present. + * "http", "https", "ftp", e.g. + * + * `host` (char *) + * + * String of host, either domain name, IPv4, or IPv6 format. + * _Must_ be present. + * "over.rainbow.oz", "192.168.0.1", "[0000:0000:0000:0001]" + * + * `port` (char *) + * + * String representation of specified port. Must resolve to a valid unsigned + * integer. + * "9000", "80" + * + * `path` (char *) + * + * Path to resource on host. If not specified, assumes root "/". + * "lollipop_guild.wav", "characters/witches/white.dat" + * + * `query` (char *) + * + * Single string of all query parameters in url (if any). + * "arg1=value1&arg2=value2" + * + * + * + * Programmer: Jacob Smith + * + *---------------------------------------------------------------------------- + */ +typedef struct { + unsigned long magic; + char *scheme; /* required */ + char *host; /* required */ + char *port; + char *path; + char *query; +} parsed_url_t; +#define S3COMMS_PARSED_URL_MAGIC 0x21D0DFUL + + +/*---------------------------------------------------------------------------- + * + * Structure: s3r_t + * + * + * + * S3 request structure "handle". + * + * Holds persistent information for Amazon S3 requests. + * + * Instantiated through `H5FD_s3comms_s3r_open()`, copies data into self. + * + * Intended to be re-used for operations on a remote object. + * + * Cleaned up through `H5FD_s3comms_s3r_close()`. + * + * _DO NOT_ share handle between threads: curl easy handle `curlhandle` has + * undefined behavior if called to perform in multiple threads. + * + * + * + * `magic` (unsigned long) + * + * "magic" number identifying this structure as unique type. + * MUST equal `S3R_MAGIC` to be valid. + * + * `curlhandle` (CURL) + * + * Pointer to the curl_easy handle generated for the request. + * + * `httpverb` (char *) + * + * Pointer to NULL-terminated string. HTTP verb, + * e.g. "GET", "HEAD", "PUT", etc. + * + * Default is NULL, resulting in a "GET" request. + * + * `purl` (parsed_url_t *) + * + * Pointer to structure holding the elements of URL for file open. + * + * e.g., "http://bucket.aws.com:8080/myfile.dat?q1=v1&q2=v2" + * parsed into... + * { scheme: "http" + * host: "bucket.aws.com" + * port: "8080" + * path: "myfile.dat" + * query: "q1=v1&q2=v2" + * } + * + * Cannot be NULL. + * + * `region` (char *) + * + * Pointer to NULL-terminated string, specifying S3 "region", + * e.g., "us-east-1". + * + * Required to authenticate. + * + * `secret_id` (char *) + * + * Pointer to NULL-terminated string for "secret" access id to S3 resource. + * + * Requred to authenticate. + * + * `signing_key` (unsigned char *) + * + * Pointer to `SHA256_DIGEST_LENGTH`-long string for "re-usable" signing + * key, generated via + * `HMAC-SHA256(HMAC-SHA256(HMAC-SHA256(HMAC-SHA256("AWS4<secret_key>", + * "<yyyyMMDD"), "<aws-region>"), "<aws-service>"), "aws4_request")` + * which may be re-used for several (up to seven (7)) days from creation? + * Computed once upon file open. + * + * Requred to authenticate. + * + * + * + * Programmer: Jacob Smith + * + *---------------------------------------------------------------------------- + */ +typedef struct { + unsigned long magic; +#ifdef H5_HAVE_ROS3_VFD + CURL *curlhandle; + size_t filesize; + char *httpverb; + parsed_url_t *purl; + char *region; + char *secret_id; + unsigned char *signing_key; +#endif /* ifdef H5_HAVE_ROS3_VFD */ +} s3r_t; +#define S3COMMS_S3R_MAGIC 0x44d8d79 + +/******************************************* + * DECLARATION OF HTTP FIELD LIST ROUTINES * + *******************************************/ + +herr_t H5FD_s3comms_hrb_node_set(hrb_node_t **L, + const char *name, + const char *value); + +/*********************************************** + * DECLARATION OF HTTP REQUEST BUFFER ROUTINES * + ***********************************************/ + +herr_t H5FD_s3comms_hrb_destroy(hrb_t **buf); + +hrb_t * H5FD_s3comms_hrb_init_request(const char *verb, + const char *resource, + const char *host); + +/************************************* + * DECLARATION OF S3REQUEST ROUTINES * + *************************************/ + +H5_DLL herr_t H5FD_s3comms_s3r_close(s3r_t *handle); + +H5_DLL size_t H5FD_s3comms_s3r_get_filesize(s3r_t *handle); + +H5_DLL s3r_t * H5FD_s3comms_s3r_open(const char url[], + const char region[], + const char id[], + const unsigned char signing_key[]); + +H5_DLL herr_t H5FD_s3comms_s3r_read(s3r_t *handle, + haddr_t offset, + size_t len, + void *dest); + +/********************************* + * DECLARATION OF OTHER ROUTINES * + *********************************/ + +H5_DLL struct tm * gmnow(void); + +herr_t H5FD_s3comms_aws_canonical_request(char *canonical_request_dest, + char *signed_headers_dest, + hrb_t *http_request); + +H5_DLL herr_t H5FD_s3comms_bytes_to_hex(char *dest, + const unsigned char *msg, + size_t msg_len, + hbool_t lowercase); + +herr_t H5FD_s3comms_free_purl(parsed_url_t *purl); + +herr_t H5FD_s3comms_HMAC_SHA256(const unsigned char *key, + size_t key_len, + const char *msg, + size_t msg_len, + char *dest); + +herr_t H5FD_s3comms_load_aws_profile(const char *name, + char *key_id_out, + char *secret_access_key_out, + char *aws_region_out); + +herr_t H5FD_s3comms_nlowercase(char *dest, + const char *s, + size_t len); + +herr_t H5FD_s3comms_parse_url(const char *str, + parsed_url_t **purl); + +herr_t H5FD_s3comms_percent_encode_char(char *repr, + const unsigned char c, + size_t *repr_len); + +H5_DLL herr_t H5FD_s3comms_signing_key(unsigned char *md, + const char *secret, + const char *region, + const char *iso8601now); + +herr_t H5FD_s3comms_tostringtosign(char *dest, + const char *req_str, + const char *now, + const char *region); + +H5_DLL herr_t H5FD_s3comms_trim(char *dest, + char *s, + size_t s_len, + size_t *n_written); + +H5_DLL herr_t H5FD_s3comms_uriencode(char *dest, + const char *s, + size_t s_len, + hbool_t encode_slash, + size_t *n_written); + + |