summaryrefslogtreecommitdiffstats
path: root/src/H5FDs3comms.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/H5FDs3comms.h')
-rw-r--r--src/H5FDs3comms.h634
1 files changed, 634 insertions, 0 deletions
diff --git a/src/H5FDs3comms.h b/src/H5FDs3comms.h
new file mode 100644
index 0000000..0524c46
--- /dev/null
+++ b/src/H5FDs3comms.h
@@ -0,0 +1,634 @@
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ * Read-Only S3 Virtual File Driver (VFD) *
+ * Copyright (c) 2017-2018, The HDF Group. *
+ * *
+ * All rights reserved. *
+ * *
+ * NOTICE: *
+ * All information contained herein is, and remains, the property of The HDF *
+ * Group. The intellectual and technical concepts contained herein are *
+ * proprietary to The HDF Group. Dissemination of this information or *
+ * reproduction of this material is strictly forbidden unless prior written *
+ * permission is obtained from The HDF Group. *
+ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+/*****************************************************************************
+ *
+ * This is the header for the S3 Communications module
+ *
+ * ***NOT A FILE DRIVER***
+ *
+ * Purpose:
+ *
+ * - Provide structures and functions related to communicating with
+ * Amazon S3 (Simple Storage Service).
+ * - Abstract away the REST API (HTTP,
+ * networked communications) behind a series of uniform function calls.
+ * - Handle AWS4 authentication, if appropriate.
+ * - Fail predictably in event of errors.
+ * - Eventually, support more S3 operations, such as creating, writing to,
+ * and removing Objects remotely.
+ *
+ * translates:
+ * `read(some_file, bytes_offset, bytes_length, &dest_buffer);`
+ * to:
+ * ```
+ * GET myfile HTTP/1.1
+ * Host: somewhere.me
+ * Range: bytes=4096-5115
+ * ```
+ * and places received bytes from HTTP response...
+ * ```
+ * HTTP/1.1 206 Partial Content
+ * Content-Range: bytes 4096-5115/63239
+ *
+ * <bytes>
+ * ```
+ * ...in destination buffer.
+ *
+ * TODO: put documentation in a consistent place and point to it from here.
+ *
+ * Programmer: Jacob Smith
+ * 2017-11-30
+ *
+ *****************************************************************************/
+
+#include <ctype.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#ifdef H5_HAVE_ROS3_VFD
+#include <curl/curl.h>
+#include <openssl/evp.h>
+#include <openssl/hmac.h>
+#include <openssl/sha.h>
+#endif /* ifdef H5_HAVE_ROS3_VFD */
+
+/*****************
+ * PUBLIC MACROS *
+ *****************/
+
+/* Lowercase hexadecimal string of the pre-computed SHA-256 checksum of the
+ * empty string, hex(sha256sum("")): 64 hex digits (32-byte digest).
+ */
+#define EMPTY_SHA256 \
+"e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
+
+/* buffer size for an ISO8601 timestamp: 16 characters plus null terminator
+ * example ISO8601-format string: "20170713T145903Z" (YYYYmmdd'T'HHMMSS'Z')
+ */
+#define ISO8601_SIZE 17
+
+/* buffer size for an RFC7231 timestamp: 29 characters plus null terminator
+ * example RFC7231-format string: "Fri, 30 Jun 2017 20:41:55 GMT"
+ */
+#define RFC7231_SIZE 30
+
+/*---------------------------------------------------------------------------
+ *
+ * Macro: ISO8601NOW()
+ *
+ * Purpose:
+ *
+ * write "YYYYmmdd'T'HHMMSS'Z'" (less single-quotes) to dest
+ * e.g., "20170630T204155Z"
+ *
+ * wrapper for strftime()
+ *
+ * It is left to the programmer to check return value of
+ * ISO8601NOW (should equal ISO8601_SIZE - 1).
+ *
+ * Programmer: Jacob Smith
+ * 2017-07-??
+ *
+ *---------------------------------------------------------------------------
+ */
+#define ISO8601NOW(dest, now_gm) /* dest: at least ISO8601_SIZE bytes; now_gm: const struct tm * */ \
+strftime((dest), ISO8601_SIZE, "%Y%m%dT%H%M%SZ", (now_gm)) /* strftime yields 0 if the result does not fit */
+
+/*---------------------------------------------------------------------------
+ *
+ * Macro: RFC7231NOW()
+ *
+ * Purpose:
+ *
+ * write "Day, dd Mmm YYYY HH:MM:SS GMT" to dest
+ * e.g., "Fri, 30 Jun 2017 20:41:55 GMT"
+ *
+ * wrapper for strftime()
+ *
+ * It is left to the programmer to check return value of
+ * RFC7231NOW (should equal RFC7231_SIZE - 1).
+ *
+ * Programmer: Jacob Smith
+ * 2017-07-??
+ *
+ *---------------------------------------------------------------------------
+ */
+#define RFC7231NOW(dest, now_gm) /* dest: at least RFC7231_SIZE bytes; now_gm: const struct tm * */ \
+strftime((dest), RFC7231_SIZE, "%a, %d %b %Y %H:%M:%S GMT", (now_gm)) /* %a and %b follow the LC_TIME locale; English names only in the "C" locale */
+
+
+/* Reasonable maximum length of a credential string.
+ * Provided for error-checking S3COMMS_FORMAT_CREDENTIAL (below).
+ * 17 <- "////aws4_request\0"
+ * 2 <- "s3" (service)
+ * 8 <- "YYYYmmdd" (date)
+ * 128 <- (access_id)
+ * 155 :: sum -- NOTE(review): no term budgets the <aws-region>; confirm
+ */
+#define S3COMMS_MAX_CREDENTIAL_SIZE 155
+
+
+/*---------------------------------------------------------------------------
+ *
+ * Macro: S3COMMS_FORMAT_CREDENTIAL()
+ *
+ * Purpose:
+ *
+ * Format "S3 Credential" string from inputs, for AWS4.
+ *
+ * Wrapper for HDsnprintf().
+ *
+ * _HAS NO ERROR-CHECKING FACILITIES_
+ * It is left to programmer to ensure that return value confers success
+ * (a value >= S3COMMS_MAX_CREDENTIAL_SIZE means the output was truncated), e.g.,
+ * ```
+ * assert( S3COMMS_MAX_CREDENTIAL_SIZE >
+ * S3COMMS_FORMAT_CREDENTIAL(...) );
+ * ```
+ *
+ * "<access-id>/<date>/<aws-region>/<aws-service>/aws4_request"
+ * assuming that `dest` has adequate space.
+ *
+ * ALL inputs must be null-terminated strings.
+ *
+ * `access` should be the user's access key ID.
+ * `date` must be of format "YYYYmmdd".
+ * `region` should be relevant AWS region, i.e. "us-east-1".
+ * `service` should be "s3".
+ *
+ * Programmer: Jacob Smith
+ * 2017-09-19
+ *
+ * Changes: None.
+ *
+ *---------------------------------------------------------------------------
+ */
+#define S3COMMS_FORMAT_CREDENTIAL(dest, access, iso8601_date, region, service) \
+HDsnprintf((dest), S3COMMS_MAX_CREDENTIAL_SIZE, /* dest must hold S3COMMS_MAX_CREDENTIAL_SIZE bytes */ \
+ "%s/%s/%s/%s/aws4_request", /* <access>/<date>/<region>/<service>/aws4_request */ \
+ (access), (iso8601_date), (region), (service)) /* assuming snprintf semantics: result >= S3COMMS_MAX_CREDENTIAL_SIZE means truncation */
+
+/*********************
+ * PUBLIC STRUCTURES *
+ *********************/
+
+
+/*----------------------------------------------------------------------------
+ *
+ * Structure: hrb_node_t
+ *
+ * HTTP Header Field Node
+ *
+ *
+ *
+ * Maintain an ordered (linked) list of HTTP Header fields.
+ *
+ * Provides efficient access and manipulation of a logical sequence of
+ * HTTP header fields, of particular use when composing an
+ * "S3 Canonical Request" for authentication.
+ *
+ * - The creation of a Canonical Request involves:
+ * - convert field names to lower case
+ * - sort by this lower-case name
+ * - convert ": " name-value separator in HTTP string to ":"
+ * - get sorted lowercase names without field or separator
+ *
+ * As HTTP headers allow headers in any order (excepting the case of multiple
+ * headers with the same name), the list ordering can be optimized for Canonical
+ * Request creation, suggesting alphabetical order. For more expedient insertion
+ * and removal of elements in the list, linked list seems preferable to a
+ * dynamically-expanding array. The usually-smaller number of entries (5 or
+ * fewer) makes performance overhead of traversing the list trivial.
+ *
+ * The above requirements of creating a Canonical Request suggest a reasonable
+ * trade-off of speed for space with the option to compute elements as needed
+ * or to have the various elements prepared and stored in the structure
+ * (e.g. name, value, lowername, concatenated name:value)
+ * The structure currently is implemented to pre-compute.
+ *
+ * At all times, the "first" node of the list should be the least,
+ * alphabetically. For all nodes, the `next` node should be either NULL or
+ * of greater alphabetical value.
+ *
+ * Each node contains its own header field information, plus a pointer to the
+ * next node.
+ *
+ * It is not allowed to have multiple nodes with the same _lowercase_ `name`s
+ * in the same list
+ * (i.e., name is case-insensitive for access and modification.)
+ *
+ * All data (`name`, `value`, `lowername`, and `cat`) are null-terminated
+ * strings allocated specifically for their node.
+ *
+ *
+ *
+ * `magic` (unsigned long)
+ *
+ * "unique" identifier number for the structure type
+ *
+ * `name` (char *)
+ *
+ * Case-meaningful name of the HTTP field.
+ * Given case is how it is supplied to networking code.
+ * e.g., "Range"
+ *
+ * `lowername` (char *)
+ *
+ * Lowercase copy of name.
+ * e.g., "range"
+ *
+ * `value` (char *)
+ *
+ * Case-meaningful value of HTTP field.
+ * e.g., "bytes=0-9"
+ *
+ * `cat` (char *)
+ *
+ * Concatenated, null-terminated string of HTTP header line,
+ * as the field would appear in an HTTP request.
+ * e.g., "Range: bytes=0-9"
+ *
+ * `next` (hrb_node_t *)
+ *
+ * Pointer to next node in the list, or NULL sentinel as end of list.
+ * Next node must have a greater `lowername` as determined by strcmp().
+ *
+ *
+ *
+ * Programmer: Jacob Smith
+ * 2017-09-22
+ *
+ * Changes:
+ *
+ * - Change from twin doubly-linked lists to singly-linked list.
+ * --- Jake Smith 2017-01-17
+ *
+ *----------------------------------------------------------------------------
+ */
+typedef struct hrb_node_t {
+ unsigned long magic; /* structure-type identifier; see S3COMMS_HRB_NODE_MAGIC */
+ char *name; /* field name, case as supplied, e.g. "Range" */
+ char *value; /* field value, e.g. "bytes=0-9" */
+ char *cat; /* full header line, e.g. "Range: bytes=0-9" */
+ char *lowername; /* lowercase copy of name, e.g. "range"; list ordering key */
+ struct hrb_node_t *next; /* next node (greater lowername per strcmp) or NULL at end of list */
+} hrb_node_t;
+#define S3COMMS_HRB_NODE_MAGIC 0x7F5757UL
+
+
+/*----------------------------------------------------------------------------
+ *
+ * Structure: hrb_t
+ *
+ * HTTP Request Buffer structure
+ *
+ *
+ *
+ * Logically represent an HTTP request
+ *
+ * GET /myplace/myfile.h5 HTTP/1.1
+ * Host: over.rainbow.oz
+ * Date: Fri, 01 Dec 2017 12:35:04 CST
+ *
+ * <body>
+ *
+ * ...with fast, efficient access to and modification of primary and field
+ * elements.
+ *
+ * Structure for building HTTP requests while hiding much of the string
+ * processing required "under the hood."
+ *
+ * Information about the request target -- the first line -- and the body text,
+ * if any, are managed directly with this structure. All header fields, e.g.,
+ * "Host" and "Date" above, are created with a linked list of `hrb_node_t` and
+ * included in the request by a pointer to the head of the list.
+ *
+ *
+ *
+ * `magic` (unsigned long)
+ *
+ * "Magic" number confirming that this is an hrb_t structure and
+ * what operations are valid for it.
+ *
+ * Must be S3COMMS_HRB_MAGIC to be valid.
+ *
+ * `body` (char *) :
+ *
+ * Pointer to start of HTTP body.
+ *
+ * Can be NULL, in which case it is treated as the empty string, "".
+ *
+ * `body_len` (size_t) :
+ *
+ * Number of bytes (characters) in `body`. 0 if empty or NULL `body`.
+ *
+ * `first_header` (hrb_node_t *) :
+ *
+ * Pointer to first SORTED header node, if any.
+ * It is left to the programmer to ensure that this node and associated
+ * list is destroyed when done.
+ *
+ * `resource` (char *) :
+ *
+ * Pointer to resource URL string, e.g., "/folder/page.xhtml".
+ *
+ * `verb` (char *) :
+ *
+ * Pointer to HTTP verb string, e.g., "GET".
+ *
+ * `version` (char *) :
+ *
+ * Pointer to HTTP version string, e.g., "HTTP/1.1".
+ *
+ *
+ *
+ * Programmer: Jacob Smith
+ *
+ *----------------------------------------------------------------------------
+ */
+typedef struct {
+ unsigned long magic; /* must be S3COMMS_HRB_MAGIC to be valid */
+ char *body; /* start of HTTP body; NULL is treated as "" */
+ size_t body_len; /* bytes in body; 0 if body is empty or NULL */
+ hrb_node_t *first_header; /* first SORTED header node, if any; caller must destroy the list */
+ char *resource; /* resource URL string, e.g. "/folder/page.xhtml" */
+ char *verb; /* HTTP verb string, e.g. "GET" */
+ char *version; /* HTTP version string, e.g. "HTTP/1.1" */
+} hrb_t;
+#define S3COMMS_HRB_MAGIC 0x6DCC84UL
+
+
+/*----------------------------------------------------------------------------
+ *
+ * Structure: parsed_url_t
+ *
+ *
+ * Represent a URL with easily-accessed pointers to logical elements within.
+ * These elements (components) are stored as null-terminated strings (or just
+ * NULLs). These components should be allocated for the structure, making the
+ * data as safe as possible from modification. If a component is NULL, it is
+ * either implicit in or absent from the URL.
+ *
+ * "http://mybucket.s3.amazonaws.com:8080/somefile.h5?param=value&arg=value"
+ * ^--^ ^-----------------------^ ^--^ ^---------^ ^-------------------^
+ * Scheme Host Port Resource Query/-ies
+ *
+ *
+ *
+ * `magic` (unsigned long)
+ *
+ * Structure identification and validation identifier.
+ * Identifies as `parsed_url_t` type.
+ *
+ * `scheme` (char *)
+ *
+ * String representing which protocol is to be expected.
+ * _Must_ be present.
+ * "http", "https", "ftp", e.g.
+ *
+ * `host` (char *)
+ *
+ * String of host, either domain name, IPv4, or IPv6 format.
+ * _Must_ be present.
+ * "over.rainbow.oz", "192.168.0.1", "[0000:0000:0000:0001]"
+ *
+ * `port` (char *)
+ *
+ * String representation of specified port. Must resolve to a valid unsigned
+ * integer.
+ * "9000", "80"
+ *
+ * `path` (char *)
+ *
+ * Path to resource on host. If not specified, assumes root "/".
+ * "lollipop_guild.wav", "characters/witches/white.dat"
+ *
+ * `query` (char *)
+ *
+ * Single string of all query parameters in url (if any).
+ * "arg1=value1&arg2=value2"
+ *
+ *
+ *
+ * Programmer: Jacob Smith
+ *
+ *----------------------------------------------------------------------------
+ */
+typedef struct {
+ unsigned long magic; /* identifies the structure as parsed_url_t; see S3COMMS_PARSED_URL_MAGIC */
+ char *scheme; /* required; protocol, e.g. "http", "https" */
+ char *host; /* required; domain name, IPv4, or IPv6 format */
+ char *port; /* port as a string, e.g. "8080"; NULL if absent from the URL */
+ char *path; /* path to resource on host; if not specified, root "/" is assumed */
+ char *query; /* all query parameters as one string, e.g. "arg1=value1&arg2=value2"; NULL if none */
+} parsed_url_t;
+#define S3COMMS_PARSED_URL_MAGIC 0x21D0DFUL
+
+
+/*----------------------------------------------------------------------------
+ *
+ * Structure: s3r_t
+ *
+ *
+ *
+ * S3 request structure "handle".
+ *
+ * Holds persistent information for Amazon S3 requests.
+ *
+ * Instantiated through `H5FD_s3comms_s3r_open()`, copies data into self.
+ *
+ * Intended to be re-used for operations on a remote object.
+ *
+ * Cleaned up through `H5FD_s3comms_s3r_close()`.
+ *
+ * _DO NOT_ share handle between threads: curl easy handle `curlhandle` has
+ * undefined behavior if called to perform in multiple threads.
+ *
+ *
+ *
+ * `magic` (unsigned long)
+ *
+ * "magic" number identifying this structure as unique type.
+ * MUST equal `S3COMMS_S3R_MAGIC` to be valid.
+ *
+ * `curlhandle` (CURL *)
+ *
+ * Pointer to the curl_easy handle generated for the request.
+ *
+ * `httpverb` (char *)
+ *
+ * Pointer to NULL-terminated string. HTTP verb,
+ * e.g. "GET", "HEAD", "PUT", etc.
+ *
+ * Default is NULL, resulting in a "GET" request.
+ *
+ * `purl` (parsed_url_t *)
+ *
+ * Pointer to structure holding the elements of URL for file open.
+ *
+ * e.g., "http://bucket.aws.com:8080/myfile.dat?q1=v1&q2=v2"
+ * parsed into...
+ * { scheme: "http"
+ * host: "bucket.aws.com"
+ * port: "8080"
+ * path: "myfile.dat"
+ * query: "q1=v1&q2=v2"
+ * }
+ *
+ * Cannot be NULL.
+ *
+ * `region` (char *)
+ *
+ * Pointer to NULL-terminated string, specifying S3 "region",
+ * e.g., "us-east-1".
+ *
+ * Required to authenticate.
+ *
+ * `secret_id` (char *)
+ *
+ * Pointer to NULL-terminated string for "secret" access id to S3 resource.
+ *
+ * Required to authenticate.
+ *
+ * `signing_key` (unsigned char *)
+ *
+ * Pointer to `SHA256_DIGEST_LENGTH`-long string for "re-usable" signing
+ * key, generated via
+ * `HMAC-SHA256(HMAC-SHA256(HMAC-SHA256(HMAC-SHA256("AWS4<secret_key>",
+ * "<yyyyMMDD>"), "<aws-region>"), "<aws-service>"), "aws4_request")`
+ * which may be re-used for several (up to seven (7)) days from creation?
+ * Computed once upon file open.
+ *
+ * Required to authenticate.
+ *
+ *
+ *
+ * Programmer: Jacob Smith
+ *
+ *----------------------------------------------------------------------------
+ */
+typedef struct {
+ unsigned long magic; /* must equal S3COMMS_S3R_MAGIC to be valid */
+#ifdef H5_HAVE_ROS3_VFD
+ CURL *curlhandle; /* curl easy handle for the request; do not share between threads */
+ size_t filesize; /* NOTE(review): presumably size of the remote object in bytes -- confirm */
+ char *httpverb; /* HTTP verb, e.g. "GET", "HEAD"; NULL results in a "GET" request */
+ parsed_url_t *purl; /* parsed elements of the URL used for file open; cannot be NULL */
+ char *region; /* S3 region, e.g. "us-east-1"; required to authenticate */
+ char *secret_id; /* "secret" access id to the S3 resource; required to authenticate */
+ unsigned char *signing_key; /* SHA256_DIGEST_LENGTH-long re-usable signing key; required to authenticate */
+#endif /* ifdef H5_HAVE_ROS3_VFD */
+} s3r_t;
+#define S3COMMS_S3R_MAGIC 0x44d8d79UL /* UL suffix: same type as `unsigned long magic`, consistent with the other magic macros */
+
+/*******************************************
+ * DECLARATION OF HTTP FIELD LIST ROUTINES *
+ *******************************************/
+
+herr_t H5FD_s3comms_hrb_node_set(hrb_node_t **L, /* address of a node pointer; presumably the head of the sorted header list -- confirm */
+ const char *name, /* header field name, e.g. "Range" */
+ const char *value); /* header field value, e.g. "bytes=0-9"; NOTE(review): NULL-value semantics are not visible in this header */
+
+/***********************************************
+ * DECLARATION OF HTTP REQUEST BUFFER ROUTINES *
+ ***********************************************/
+
+herr_t H5FD_s3comms_hrb_destroy(hrb_t **buf); /* takes the address of the hrb_t pointer; NOTE(review): presumably resets it after release -- confirm */
+
+hrb_t * H5FD_s3comms_hrb_init_request(const char *verb, /* HTTP verb string, e.g. "GET" */
+ const char *resource, /* resource string, e.g. "/folder/page.xhtml" */
+ const char *host); /* NOTE(review): hrb_t has no host member; presumably kept as a "Host" header node -- confirm */
+
+/*************************************
+ * DECLARATION OF S3REQUEST ROUTINES *
+ *************************************/
+
+H5_DLL herr_t H5FD_s3comms_s3r_close(s3r_t *handle); /* cleans up a handle instantiated by H5FD_s3comms_s3r_open() */
+
+H5_DLL size_t H5FD_s3comms_s3r_get_filesize(s3r_t *handle); /* NOTE(review): presumably reports the handle's `filesize` member -- confirm */
+
+H5_DLL s3r_t * H5FD_s3comms_s3r_open(const char url[], /* URL of the remote object to open */
+ const char region[], /* S3 region, e.g. "us-east-1" */
+ const char id[], /* NOTE(review): presumably the access id kept in `secret_id` -- confirm */
+ const unsigned char signing_key[]); /* signing key; documented on s3r_t as SHA256_DIGEST_LENGTH long */
+
+H5_DLL herr_t H5FD_s3comms_s3r_read(s3r_t *handle, /* handle from H5FD_s3comms_s3r_open() */
+ haddr_t offset, /* offset of the read, in bytes */
+ size_t len, /* length of the read, in bytes */
+ void *dest); /* destination buffer for the received bytes */
+
+/*********************************
+ * DECLARATION OF OTHER ROUTINES *
+ *********************************/
+
+H5_DLL struct tm * gmnow(void); /* NOTE(review): presumably the current time broken down as UTC -- confirm */
+
+herr_t H5FD_s3comms_aws_canonical_request(char *canonical_request_dest, /* output buffer; no size parameter, caller must size it */
+ char *signed_headers_dest, /* output buffer; no size parameter, caller must size it */
+ hrb_t *http_request); /* request whose headers are canonicalized */
+
+H5_DLL herr_t H5FD_s3comms_bytes_to_hex(char *dest, /* output buffer; NOTE(review): required size not visible here, presumably 2 * msg_len plus terminator */
+ const unsigned char *msg, /* bytes to render */
+ size_t msg_len, /* number of bytes in msg */
+ hbool_t lowercase); /* NOTE(review): presumably selects lowercase hex digits -- confirm */
+
+herr_t H5FD_s3comms_free_purl(parsed_url_t *purl); /* NOTE(review): presumably releases the structure and its component strings -- confirm */
+
+herr_t H5FD_s3comms_HMAC_SHA256(const unsigned char *key, /* key bytes */
+ size_t key_len, /* number of bytes in key */
+ const char *msg, /* message bytes */
+ size_t msg_len, /* number of bytes in msg */
+ char *dest); /* output; NOTE(review): char * with no size parameter, output format (raw or hex) not visible here */
+
+herr_t H5FD_s3comms_load_aws_profile(const char *name, /* profile name */
+ char *key_id_out, /* output buffer; no size parameter */
+ char *secret_access_key_out, /* output buffer; no size parameter */
+ char *aws_region_out); /* output buffer; no size parameter */
+
+herr_t H5FD_s3comms_nlowercase(char *dest, /* output buffer */
+ const char *s, /* input string */
+ size_t len); /* NOTE(review): presumably number of characters of s to convert -- confirm */
+
+herr_t H5FD_s3comms_parse_url(const char *str, /* URL text to parse */
+ parsed_url_t **purl); /* address of the pointer that receives the parsed URL */
+
+herr_t H5FD_s3comms_percent_encode_char(char *repr, /* output buffer for the encoded form; required size not visible here */
+ const unsigned char c, /* character to encode */
+ size_t *repr_len); /* output: NOTE(review): presumably length written to repr -- confirm */
+
+H5_DLL herr_t H5FD_s3comms_signing_key(unsigned char *md, /* output; s3r_t documents the key as SHA256_DIGEST_LENGTH long */
+ const char *secret, /* secret key */
+ const char *region, /* AWS region, e.g. "us-east-1" */
+ const char *iso8601now); /* NOTE(review): presumably a timestamp as produced by ISO8601NOW -- confirm */
+
+herr_t H5FD_s3comms_tostringtosign(char *dest, /* output buffer; no size parameter */
+ const char *req_str, /* NOTE(review): presumably the canonical request string -- confirm */
+ const char *now, /* NOTE(review): presumably an ISO8601 timestamp -- confirm */
+ const char *region); /* AWS region, e.g. "us-east-1" */
+
+H5_DLL herr_t H5FD_s3comms_trim(char *dest, /* output buffer */
+ char *s, /* input string (not const-qualified) */
+ size_t s_len, /* length of s */
+ size_t *n_written); /* output: NOTE(review): presumably count of characters written to dest */
+
+H5_DLL herr_t H5FD_s3comms_uriencode(char *dest, /* output buffer; no size parameter */
+ const char *s, /* input string */
+ size_t s_len, /* length of s */
+ hbool_t encode_slash, /* NOTE(review): presumably whether '/' is percent-encoded -- confirm */
+ size_t *n_written); /* output: NOTE(review): presumably count of characters written to dest */
+
+