summaryrefslogtreecommitdiffstats
path: root/src/H5FDs3comms.h
blob: 94fae7e90af89b748008a584fe9841ccb3c1511d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 * Copyright by The HDF Group.                                               *
 * All rights reserved.                                                      *
 *                                                                           *
 * This file is part of HDF5.  The full HDF5 copyright notice, including     *
 * terms governing use, modification, and redistribution, is contained in    *
 * the COPYING file, which can be found at the root of the source code       *
 * distribution tree, or in https://support.hdfgroup.org/ftp/HDF5/releases.  *
 * If you do not have access to either file, you may request a copy from     *
 * help@hdfgroup.org.                                                        *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */

/*****************************************************************************
 * Read-Only S3 Virtual File Driver (VFD)                                   
 *
 * This is the header for the S3 Communications module
 *
 * ***NOT A FILE DRIVER***
 *
 * Purpose:
 *
 *     - Provide structures and functions related to communicating with
 *       Amazon S3 (Simple Storage Service).
 *     - Abstract away the REST API (HTTP,
 *       networked communications) behind a series of uniform function calls.
 *     - Handle AWS4 authentication, if appropriate.
 *     - Fail predictably in event of errors.
 *     - Eventually, support more S3 operations, such as creating, writing to,
 *       and removing Objects remotely.
 *
 *     translates:
 *     `read(some_file, bytes_offset, bytes_length, &dest_buffer);`
 *     to:
 *     ```
 *     GET myfile HTTP/1.1
 *     Host: somewhere.me
 *     Range: bytes=4096-5115
 *     ```
 *     and places received bytes from HTTP response...
 *     ```
 *     HTTP/1.1 206 Partial-Content
 *     Content-Range: 4096-5115/63239
 *
 *     <bytes>
 *     ```
 *     ...in destination buffer.
 *
 * TODO: put documentation in a consistent place and point to it from here.
 *
 * Programmer: Jacob Smith
 *             2017-11-30
 *
 *****************************************************************************/

#include "H5private.h"      /* Generic Functions        */

#ifdef H5_HAVE_ROS3_VFD

/* Necessary S3 headers */
#include <curl/curl.h>
#include <openssl/evp.h>
#include <openssl/hmac.h>
#include <openssl/sha.h>

/*****************
 * PUBLIC MACROS *
 *****************/

/* hexadecimal string of pre-computed sha256 checksum of the empty string
 * hex(sha256sum(""))
 */
#define EMPTY_SHA256 \
"e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"

/* string length (plus null terminator)
 * example ISO8601-format string: "20170713T145903Z" (YYYYmmdd'T'HHMMSS'_')
 */
#define ISO8601_SIZE 17

/* string length (plus null terminator)
 * example RFC7231-format string: "Fri, 30 Jun 2017 20:41:55 GMT"
 */
#define RFC7231_SIZE 30

/*---------------------------------------------------------------------------
 *
 * Macro: ISO8601NOW()
 *
 * Purpose:
 *
 *     write "YYYYmmdd'T'HHMMSS'Z'" (less single-quotes) to dest
 *     e.g., "20170630T204155Z"
 *
 *     wrapper for strftime()
 *
 *     It is left to the programmer to check return value of
 *     ISO8601NOW (should equal ISO8601_SIZE - 1).
 *
 *---------------------------------------------------------------------------
 */
#define ISO8601NOW(dest, now_gm) \
strftime((dest), ISO8601_SIZE, "%Y%m%dT%H%M%SZ", (now_gm))

/*---------------------------------------------------------------------------
 *
 * Macro: RFC7231NOW()
 *
 * Purpose:
 *
 *     write "Day, dd Mmm YYYY HH:MM:SS GMT" to dest
 *     e.g., "Fri, 30 Jun 2017 20:41:55 GMT"
 *
 *     wrapper for strftime()
 *
 *     It is left to the programmer to check return value of
 *     RFC7231NOW (should equal RFC7231_SIZE - 1).
 *
 *---------------------------------------------------------------------------
 */
#define RFC7231NOW(dest, now_gm) \
strftime((dest), RFC7231_SIZE, "%a, %d %b %Y %H:%M:%S GMT", (now_gm))


/* Reasonable maximum length of a credential string.
 * Provided for error-checking S3COMMS_FORMAT_CREDENTIAL (below).
 *  17 <- "////aws4_request\0"
 *   2 < "s3" (service)
 *   8 <- "YYYYmmdd" (date)
 * 128 <- (access_id)
 * 155 :: sum
 */
#define S3COMMS_MAX_CREDENTIAL_SIZE 155


/*---------------------------------------------------------------------------
 *
 * Macro: H5FD_S3COMMS_FORMAT_CREDENTIAL()
 *
 * Purpose:
 *
 *     Format "S3 Credential" string from inputs, for AWS4.
 *
 *     Wrapper for HDsnprintf().
 *
 *     _HAS NO ERROR-CHECKING FACILITIES_
 *     It is left to programmer to ensure that return value confers success.
 *     e.g.,
 *     ```
 *     assert( S3COMMS_MAX_CREDENTIAL_SIZE >=
 *             S3COMMS_FORMAT_CREDENTIAL(...) );
 *     ```
 *
 *     "<access-id>/<date>/<aws-region>/<aws-service>/aws4_request"
 *     assuming that `dest` has adequate space.
 *
 *     ALL inputs must be null-terminated strings.
 *
 *     `access` should be the user's access key ID.
 *     `date` must be of format "YYYYmmdd".
 *     `region` should be relevant AWS region, i.e. "us-east-1".
 *     `service` should be "s3".
 *
 *---------------------------------------------------------------------------
 */
#define S3COMMS_FORMAT_CREDENTIAL(dest, access, iso8601_date, region, service) \
HDsnprintf((dest), S3COMMS_MAX_CREDENTIAL_SIZE,                                  \
         "%s/%s/%s/%s/aws4_request",                                           \
         (access), (iso8601_date), (region), (service))

/*********************
 * PUBLIC STRUCTURES *
 *********************/


/*----------------------------------------------------------------------------
 *
 * Structure: hrb_node_t
 *
 * HTTP Header Field Node
 *
 *
 *
 * Maintain a ordered (linked) list of HTTP Header fields.
 *
 * Provides efficient access and manipulation of a logical sequence of
 * HTTP header fields, of particular use when composing an
 * "S3 Canonical Request" for authentication.
 *
 * - The creation of a Canoncial Request involves:
 *     - convert field names to lower case
 *     - sort by this lower-case name
 *     - convert ": " name-value separator in HTTP string to ":"
 *     - get sorted lowercase names without field or separator
 *
 * As HTTP headers allow headers in any order (excepting the case of multiple
 * headers with the same name), the list ordering can be optimized for Canonical
 * Request creation, suggesting alphabtical order. For more expedient insertion
 * and removal of elements in the list, linked list seems preferable to a
 * dynamically-expanding array. The usually-smaller number of entries (5 or
 * fewer) makes performance overhead of traversing the list trivial.
 *
 * The above requirements of creating at Canonical Request suggests a reasonable
 * trade-off of speed for space with the option to compute elements as needed
 * or to have the various elements prepared and stored in the structure
 * (e.g. name, value, lowername, concatenated name:value)
 * The structure currently is implemented to pre-compute.
 *
 * At all times, the "first" node of the list should be the least,
 * alphabetically. For all nodes, the `next` node should be either NULL or
 * of greater alphabetical value.
 *
 * Each node contains its own header field information, plus a pointer to the
 * next node.
 *
 * It is not allowed to have multiple nodes with the same _lowercase_ `name`s
 * in the same list
 * (i.e., name is case-insensitive for access and modification.)
 *
 * All data (`name`, `value`, `lowername`, and `cat`) are null-terminated
 * strings allocated specifically for their node.
 *
 *
 *
 * `magic` (unsigned long)
 *
 *     "unique" idenfier number for the structure type
 *
 * `name` (char *)
 *
 *     Case-meaningful name of the HTTP field.
 *     Given case is how it is supplied to networking code.
 *     e.g., "Range"
 *
 * `lowername` (char *)
 *
 *     Lowercase copy of name.
 *     e.g., "range"
 *
 * `value` (char *)
 *
 *     Case-meaningful value of HTTP field.
 *     e.g., "bytes=0-9"
 *
 * `cat` (char *)
 *
 *     Concatenated, null-terminated string of HTTP header line,
 *     as the field would appear in an HTTP request.
 *     e.g., "Range: bytes=0-9"
 *
 * `next` (hrb_node_t *)
 *
 *     Pointers to next node in the list, or NULL sentinel as end of list.
 *     Next node must have a greater `lowername` as determined by strcmp().
 *
 *----------------------------------------------------------------------------
 */
typedef struct hrb_node_t {
    unsigned long      magic;
    char              *name;
    char              *value;
    char              *cat;
    char              *lowername;
    struct hrb_node_t *next;
} hrb_node_t;
#define S3COMMS_HRB_NODE_MAGIC 0x7F5757UL


/*----------------------------------------------------------------------------
 *
 * Structure: hrb_t
 *
 * HTTP Request Buffer structure
 *
 *
 *
 * Logically represent an HTTP request
 *
 *     GET /myplace/myfile.h5 HTTP/1.1
 *     Host: over.rainbow.oz
 *     Date: Fri, 01 Dec 2017 12:35:04 CST
 *
 *     <body>
 *
 * ...with fast, efficient access to and modification of primary and field
 * elements.
 *
 * Structure for building HTTP requests while hiding much of the string
 * processing required "under the hood."
 *
 * Information about the request target -- the first line -- and the body text,
 * if any, are managed directly with this structure. All header fields, e.g.,
 * "Host" and "Date" above, are created with a linked list of `hrb_node_t` and
 * included in the request by a pointer to the head of the list.
 *
 *
 *
 * `magic` (unsigned long)
 *
 *     "Magic" number confirming that this is an hrb_t structure and
 *     what operations are valid for it.
 *
 *     Must be S3COMMS_HRB_MAGIC to be valid.
 *
 * `body` (char *) :
 *
 *     Pointer to start of HTTP body.
 *
 *     Can be NULL, in which case it is treated as the empty string, "".
 *
 * `body_len` (size_t) :
 *
 *     Number of bytes (characters) in `body`. 0 if empty or NULL `body`.
 *
 * `first_header` (hrb_node_t *) :
 *
 *     Pointer to first SORTED header node, if any.
 *     It is left to the programmer to ensure that this node and associated
 *     list is destroyed when done.
 *
 * `resource` (char *) :
 *
 *     Pointer to resource URL string, e.g., "/folder/page.xhtml".
 *
 * `verb` (char *) :
 *
 *     Pointer to HTTP verb string, e.g., "GET".
 *
 * `version` (char *) :
 *
 *     Pointer to HTTP version string, e.g., "HTTP/1.1".
 *
 *----------------------------------------------------------------------------
 */
typedef struct {
    unsigned long  magic;
    char          *body;
    size_t         body_len;
    hrb_node_t    *first_header;
    char          *resource;
    char          *verb;
    char          *version;
} hrb_t;
#define S3COMMS_HRB_MAGIC 0x6DCC84UL


/*----------------------------------------------------------------------------
 *
 * Structure: parsed_url_t
 *
 *
 * Represent a URL with easily-accessed pointers to logical elements within.
 * These elements (components) are stored as null-terminated strings (or just
 * NULLs). These components should be allocated for the structure, making the
 * data as safe as possible from modification. If a component is NULL, it is
 * either implicit in or absent from the URL.
 *
 * "http://mybucket.s3.amazonaws.com:8080/somefile.h5?param=value&arg=value"
 *  ^--^   ^-----------------------^ ^--^ ^---------^ ^-------------------^
 * Scheme             Host           Port  Resource        Query/-ies
 *
 *
 *
 * `magic` (unsigned long)
 *
 *     Structure identification and validation identifier.
 *     Identifies as `parsed_url_t` type.
 *
 * `scheme` (char *)
 *
 *     String representing which protocol is to be expected.
 *     _Must_ be present.
 *     "http", "https", "ftp", e.g.
 *
 * `host` (char *)
 *
 *     String of host, either domain name, IPv4, or IPv6 format.
 *     _Must_ be present.
 *     "over.rainbow.oz", "192.168.0.1", "[0000:0000:0000:0001]"
 *
 * `port` (char *)
 *
 *     String representation of specified port. Must resolve to a valid unsigned
 *     integer.
 *     "9000", "80"
 *
 * `path` (char *)
 *
 *     Path to resource on host. If not specified, assumes root "/".
 *     "lollipop_guild.wav", "characters/witches/white.dat"
 *
 * `query` (char *)
 *
 *     Single string of all query parameters in url (if any).
 *     "arg1=value1&arg2=value2"
 *
 *----------------------------------------------------------------------------
 */
typedef struct {
    unsigned long  magic;
    char          *scheme; /* required */
    char          *host;   /* required */
    char          *port;
    char          *path;
    char          *query;
} parsed_url_t;
#define S3COMMS_PARSED_URL_MAGIC 0x21D0DFUL


/*----------------------------------------------------------------------------
 *
 * Structure: s3r_t
 *
 *
 *
 * S3 request structure "handle".
 *
 * Holds persistent information for Amazon S3 requests.
 *
 * Instantiated through `H5FD_s3comms_s3r_open()`, copies data into self.
 *
 * Intended to be re-used for operations on a remote object.
 *
 * Cleaned up through `H5FD_s3comms_s3r_close()`.
 *
 * _DO NOT_ share handle between threads: curl easy handle `curlhandle` has
 * undefined behavior if called to perform in multiple threads.
 *
 *
 *
 * `magic` (unsigned long)
 *
 *     "magic" number identifying this structure as unique type.
 *     MUST equal `S3R_MAGIC` to be valid.
 *
 * `curlhandle` (CURL)
 *
 *     Pointer to the curl_easy handle generated for the request.
 *
 * `httpverb` (char *)
 *
 *     Pointer to NULL-terminated string. HTTP verb,
 *     e.g. "GET", "HEAD", "PUT", etc.
 *
 *     Default is NULL, resulting in a "GET" request.
 *
 * `purl` (parsed_url_t *)
 *
 *     Pointer to structure holding the elements of URL for file open.
 *
 *     e.g., "http://bucket.aws.com:8080/myfile.dat?q1=v1&q2=v2"
 *     parsed into...
 *     {   scheme: "http"
 *         host:   "bucket.aws.com"
 *         port:   "8080"
 *         path:   "myfile.dat"
 *         query:  "q1=v1&q2=v2"
 *     }
 *
 *     Cannot be NULL.
 *
 * `region` (char *)
 *
 *     Pointer to NULL-terminated string, specifying S3 "region",
 *     e.g., "us-east-1".
 *
 *     Required to authenticate.
 *
 * `secret_id` (char *)
 *
 *     Pointer to NULL-terminated string for "secret" access id to S3 resource.
 *
 *     Requred to authenticate.
 *
 * `signing_key` (unsigned char *)
 *
 *     Pointer to `SHA256_DIGEST_LENGTH`-long string for "re-usable" signing
 *     key, generated via
 *     `HMAC-SHA256(HMAC-SHA256(HMAC-SHA256(HMAC-SHA256("AWS4<secret_key>",
 *         "<yyyyMMDD"), "<aws-region>"), "<aws-service>"), "aws4_request")`
 *     which may be re-used for several (up to seven (7)) days from creation?
 *     Computed once upon file open.
 *
 *     Requred to authenticate.
 *
 *----------------------------------------------------------------------------
 */
typedef struct {
    unsigned long  magic;
    CURL          *curlhandle;
    size_t         filesize;
    char          *httpverb;
    parsed_url_t  *purl;
    char          *region;
    char          *secret_id;
    unsigned char *signing_key;
} s3r_t;

#define S3COMMS_S3R_MAGIC 0x44d8d79

#ifdef __cplusplus
extern "C" {
#endif

/*******************************************
 * DECLARATION OF HTTP FIELD LIST ROUTINES *
 *******************************************/

H5_DLL herr_t H5FD_s3comms_hrb_node_set(hrb_node_t **L,
                                 const char  *name,
                                 const char  *value);

/***********************************************
 * DECLARATION OF HTTP REQUEST BUFFER ROUTINES *
 ***********************************************/

H5_DLL herr_t H5FD_s3comms_hrb_destroy(hrb_t **buf);

H5_DLL hrb_t * H5FD_s3comms_hrb_init_request(const char *verb,
                                      const char *resource,
                                      const char *host);

/*************************************
 * DECLARATION OF S3REQUEST ROUTINES *
 *************************************/

H5_DLL herr_t H5FD_s3comms_s3r_close(s3r_t *handle);

H5_DLL size_t H5FD_s3comms_s3r_get_filesize(s3r_t *handle);

H5_DLL s3r_t * H5FD_s3comms_s3r_open(const char          url[],
                                     const char          region[],
                                     const char          id[],
                                     const unsigned char signing_key[]);

H5_DLL herr_t H5FD_s3comms_s3r_read(s3r_t   *handle,
                                    haddr_t  offset,
                                    size_t   len,
                                    void    *dest);

/*********************************
 * DECLARATION OF OTHER ROUTINES *
 *********************************/

H5_DLL struct tm * gmnow(void);

H5_DLL herr_t H5FD_s3comms_aws_canonical_request(char  *canonical_request_dest,
                                          int    cr_size,
                                          char  *signed_headers_dest,
                                          int    sh_size,
                                          hrb_t *http_request);

H5_DLL herr_t H5FD_s3comms_bytes_to_hex(char                *dest,
                                 const unsigned char *msg,
                                 size_t               msg_len,
                                 hbool_t              lowercase);

H5_DLL herr_t H5FD_s3comms_free_purl(parsed_url_t *purl);

H5_DLL herr_t H5FD_s3comms_HMAC_SHA256(const unsigned char *key,
                                size_t               key_len,
                                const char          *msg,
                                size_t               msg_len,
                                char                *dest);

H5_DLL herr_t H5FD_s3comms_load_aws_profile(const char *name,
                                     char       *key_id_out,
                                     char       *secret_access_key_out,
                                     char       *aws_region_out);

H5_DLL herr_t H5FD_s3comms_nlowercase(char       *dest,
                               const char *s,
                               size_t      len);

H5_DLL herr_t H5FD_s3comms_parse_url(const char    *str,
                              parsed_url_t **purl);

H5_DLL herr_t H5FD_s3comms_percent_encode_char(char                *repr,
                                        const unsigned char  c,
                                        size_t              *repr_len);

H5_DLL herr_t H5FD_s3comms_signing_key(unsigned char *md,
                                       const char    *secret,
                                       const char    *region,
                                       const char    *iso8601now);

H5_DLL herr_t H5FD_s3comms_tostringtosign(char       *dest,
                                   const char *req_str,
                                   const char *now,
                                   const char *region);

H5_DLL herr_t H5FD_s3comms_trim(char   *dest,
                         char   *s,
                         size_t  s_len,
                         size_t *n_written);

H5_DLL herr_t H5FD_s3comms_uriencode(char *dest, const char *s, size_t s_len,
    hbool_t encode_slash, size_t *n_written);

#ifdef __cplusplus
}
#endif

#endif /* H5_HAVE_ROS3_VFD */