HDF5  1.12.0
H5FDs3comms.h
Go to the documentation of this file.
1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  * Copyright by The HDF Group. *
3  * All rights reserved. *
4  * *
5  * This file is part of HDF5. The full HDF5 copyright notice, including *
6  * terms governing use, modification, and redistribution, is contained in *
7  * the COPYING file, which can be found at the root of the source code *
8  * distribution tree, or in https://support.hdfgroup.org/ftp/HDF5/releases. *
9  * If you do not have access to either file, you may request a copy from *
10  * help@hdfgroup.org. *
11  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
12 
13 /*****************************************************************************
14  * Read-Only S3 Virtual File Driver (VFD)
15  *
16  * This is the header for the S3 Communications module
17  *
18  * ***NOT A FILE DRIVER***
19  *
20  * Purpose:
21  *
22  * - Provide structures and functions related to communicating with
23  * Amazon S3 (Simple Storage Service).
24  * - Abstract away the REST API (HTTP,
25  * networked communications) behind a series of uniform function calls.
26  * - Handle AWS4 authentication, if appropriate.
27  * - Fail predictably in event of errors.
28  * - Eventually, support more S3 operations, such as creating, writing to,
29  * and removing Objects remotely.
30  *
31  * translates:
32  * `read(some_file, bytes_offset, bytes_length, &dest_buffer);`
33  * to:
34  * ```
35  * GET myfile HTTP/1.1
36  * Host: somewhere.me
37  * Range: bytes=4096-5115
38  * ```
39  * and places received bytes from HTTP response...
40  * ```
41  * HTTP/1.1 206 Partial Content
42  * Content-Range: bytes 4096-5115/63239
43  *
44  * <bytes>
45  * ```
46  * ...in destination buffer.
47  *
48  * TODO: put documentation in a consistent place and point to it from here.
49  *
50  * Programmer: Jacob Smith
51  * 2017-11-30
52  *
53  *****************************************************************************/
54 
55 #include "H5private.h" /* Generic Functions */
56 
57 #ifdef H5_HAVE_ROS3_VFD
58 
59 /* Necessary S3 headers */
60 #include <curl/curl.h>
61 #include <openssl/evp.h>
62 #include <openssl/hmac.h>
63 #include <openssl/sha.h>
64 
65 /*****************
66  * PUBLIC MACROS *
67  *****************/
68 
69 /* hexadecimal string of pre-computed sha256 checksum of the empty string
70  * hex(sha256sum(""))
71  */
72 #define EMPTY_SHA256 /* lowercase hex SHA-256 digest of the empty string "" */ \
73 "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855" /* 64 hex digits = 32 digest bytes */
74 
75 /* string length (plus null terminator)
76  * example ISO8601-format string: "20170713T145903Z" (YYYYmmdd'T'HHMMSS'Z')
77  */
78 #define ISO8601_SIZE 17 /* 16 characters of "YYYYmmddTHHMMSSZ" + 1 NUL terminator */
79 
80 /* string length (plus null terminator)
81  * example RFC7231-format string: "Fri, 30 Jun 2017 20:41:55 GMT"
82  */
83 #define RFC7231_SIZE 30 /* 29 characters of "Day, dd Mmm YYYY HH:MM:SS GMT" + 1 NUL terminator */
84 
85 /*---------------------------------------------------------------------------
86  *
87  * Macro: ISO8601NOW()
88  *
89  * Purpose:
90  *
91  * write "YYYYmmdd'T'HHMMSS'Z'" (less single-quotes) to dest
92  * e.g., "20170630T204155Z"
93  *
94  * wrapper for strftime()
95  *
96  * It is left to the programmer to check return value of
97  * ISO8601NOW (should equal ISO8601_SIZE - 1).
98  *
99  *---------------------------------------------------------------------------
100  */
101 #define ISO8601NOW(dest, now_gm) /* dest: char buffer of at least ISO8601_SIZE bytes; now_gm: struct tm pointer handed to strftime() */ \
102 strftime((dest), ISO8601_SIZE, "%Y%m%dT%H%M%SZ", (now_gm)) /* evaluates to strftime()'s result: characters written excluding the NUL, or 0 if the output did not fit */
103 
104 /*---------------------------------------------------------------------------
105  *
106  * Macro: RFC7231NOW()
107  *
108  * Purpose:
109  *
110  * write "Day, dd Mmm YYYY HH:MM:SS GMT" to dest
111  * e.g., "Fri, 30 Jun 2017 20:41:55 GMT"
112  *
113  * wrapper for strftime()
114  *
115  * It is left to the programmer to check return value of
116  * RFC7231NOW (should equal RFC7231_SIZE - 1).
117  *
118  *---------------------------------------------------------------------------
119  */
120 #define RFC7231NOW(dest, now_gm) /* dest: char buffer of at least RFC7231_SIZE bytes; now_gm: struct tm pointer handed to strftime() */ \
121 strftime((dest), RFC7231_SIZE, "%a, %d %b %Y %H:%M:%S GMT", (now_gm)) /* evaluates to strftime()'s result; "GMT" is a literal, so now_gm is expected to already be in GMT; %a and %b are locale-dependent */
122 
123 
124 /* Reasonable maximum length of a credential string.
125  * Provided for error-checking S3COMMS_FORMAT_CREDENTIAL (below).
126  * 17 <- "////aws4_request\0"
127  * 2 <- "s3" (service)
128  * 8 <- "YYYYmmdd" (date)
129  * 128 <- (access_id)
130  * 155 :: sum
131  */
132 #define S3COMMS_MAX_CREDENTIAL_SIZE 155 /* size argument that S3COMMS_FORMAT_CREDENTIAL gives to HDsnprintf(); NOTE(review): the tally above has no term for <aws-region> -- confirm 155 is sufficient for the longest expected input */
133 
134 
135 /*---------------------------------------------------------------------------
136  *
137  * Macro: S3COMMS_FORMAT_CREDENTIAL()
138  *
139  * Purpose:
140  *
141  * Format "S3 Credential" string from inputs, for AWS4.
142  *
143  * Wrapper for HDsnprintf().
144  *
145  * _HAS NO ERROR-CHECKING FACILITIES_
146  * It is left to programmer to ensure that return value confers success.
147  * e.g.,
148  * ```
149  * assert( S3COMMS_MAX_CREDENTIAL_SIZE >=
150  * S3COMMS_FORMAT_CREDENTIAL(...) );
151  * ```
152  *
153  * "<access-id>/<date>/<aws-region>/<aws-service>/aws4_request"
154  * assuming that `dest` has adequate space.
155  *
156  * ALL inputs must be null-terminated strings.
157  *
158  * `access` should be the user's access key ID.
159  * `iso8601_date` must be of format "YYYYmmdd".
160  * `region` should be relevant AWS region, i.e. "us-east-1".
161  * `service` should be "s3".
162  *
163  *---------------------------------------------------------------------------
164  */
165 #define S3COMMS_FORMAT_CREDENTIAL(dest, access, iso8601_date, region, service) /* dest: output char buffer; remaining arguments: NUL-terminated strings */ \
166 HDsnprintf((dest), S3COMMS_MAX_CREDENTIAL_SIZE, /* size limit handed to HDsnprintf() (presumably snprintf() semantics -- confirm in H5private.h) */ \
167  "%s/%s/%s/%s/aws4_request", /* <access>/<iso8601_date>/<region>/<service>/aws4_request */ \
168  (access), (iso8601_date), (region), (service)) /* evaluates to HDsnprintf()'s return value; the caller must check it */
169 
170 /*********************
171  * PUBLIC STRUCTURES *
172  *********************/
173 
174 
175 /*----------------------------------------------------------------------------
176  *
177  * Structure: hrb_node_t
178  *
179  * HTTP Header Field Node
180  *
181  *
182  *
183  * Maintain an ordered (linked) list of HTTP Header fields.
184  *
185  * Provides efficient access and manipulation of a logical sequence of
186  * HTTP header fields, of particular use when composing an
187  * "S3 Canonical Request" for authentication.
188  *
189  * - The creation of a Canonical Request involves:
190  * - convert field names to lower case
191  * - sort by this lower-case name
192  * - convert ": " name-value separator in HTTP string to ":"
193  * - get sorted lowercase names without field or separator
194  *
195  * As HTTP headers allow headers in any order (excepting the case of multiple
196  * headers with the same name), the list ordering can be optimized for Canonical
197  * Request creation, suggesting alphabetical order. For more expedient insertion
198  * and removal of elements in the list, linked list seems preferable to a
199  * dynamically-expanding array. The usually-smaller number of entries (5 or
200  * fewer) makes performance overhead of traversing the list trivial.
201  *
202  * The above requirements of creating a Canonical Request suggest a reasonable
203  * trade-off of speed for space with the option to compute elements as needed
204  * or to have the various elements prepared and stored in the structure
205  * (e.g. name, value, lowername, concatenated name:value)
206  * The structure currently is implemented to pre-compute.
207  *
208  * At all times, the "first" node of the list should be the least,
209  * alphabetically. For all nodes, the `next` node should be either NULL or
210  * of greater alphabetical value.
211  *
212  * Each node contains its own header field information, plus a pointer to the
213  * next node.
214  *
215  * It is not allowed to have multiple nodes with the same _lowercase_ `name`s
216  * in the same list
217  * (i.e., name is case-insensitive for access and modification.)
218  *
219  * All data (`name`, `value`, `lowername`, and `cat`) are null-terminated
220  * strings allocated specifically for their node.
221  *
222  *
223  *
224  * `magic` (unsigned long)
225  *
226  * "unique" identifier number for the structure type
227  *
228  * `name` (char *)
229  *
230  * Case-meaningful name of the HTTP field.
231  * Given case is how it is supplied to networking code.
232  * e.g., "Range"
233  *
234  * `lowername` (char *)
235  *
236  * Lowercase copy of name.
237  * e.g., "range"
238  *
239  * `value` (char *)
240  *
241  * Case-meaningful value of HTTP field.
242  * e.g., "bytes=0-9"
243  *
244  * `cat` (char *)
245  *
246  * Concatenated, null-terminated string of HTTP header line,
247  * as the field would appear in an HTTP request.
248  * e.g., "Range: bytes=0-9"
249  *
250  * `next` (hrb_node_t *)
251  *
252  * Pointer to next node in the list, or NULL sentinel as end of list.
253  * Next node must have a greater `lowername` as determined by strcmp().
254  *
255  *----------------------------------------------------------------------------
256  */
257 typedef struct hrb_node_t { /* one HTTP header field in a list kept sorted by `lowername` */
258  unsigned long magic; /* identifier number for this structure type */
259  char *name; /* field name in the case it was given, e.g., "Range" */
260  char *value; /* field value, e.g., "bytes=0-9" */
261  char *cat; /* complete header line, e.g., "Range: bytes=0-9" */
262  char *lowername; /* lowercase copy of `name`, e.g., "range"; unique within a list */
263  struct hrb_node_t *next; /* next node (greater `lowername`), or NULL at end of list */
264 } hrb_node_t;
265 #define S3COMMS_HRB_NODE_MAGIC 0x7F5757UL /* presumably the `magic` value of a valid hrb_node_t -- confirm in H5FDs3comms.c */
266 
267 
268 /*----------------------------------------------------------------------------
269  *
270  * Structure: hrb_t
271  *
272  * HTTP Request Buffer structure
273  *
274  *
275  *
276  * Logically represent an HTTP request
277  *
278  * GET /myplace/myfile.h5 HTTP/1.1
279  * Host: over.rainbow.oz
280  * Date: Fri, 01 Dec 2017 12:35:04 GMT
281  *
282  * <body>
283  *
284  * ...with fast, efficient access to and modification of primary and field
285  * elements.
286  *
287  * Structure for building HTTP requests while hiding much of the string
288  * processing required "under the hood."
289  *
290  * Information about the request target -- the first line -- and the body text,
291  * if any, are managed directly with this structure. All header fields, e.g.,
292  * "Host" and "Date" above, are created with a linked list of `hrb_node_t` and
293  * included in the request by a pointer to the head of the list.
294  *
295  *
296  *
297  * `magic` (unsigned long)
298  *
299  * "Magic" number confirming that this is an hrb_t structure and
300  * what operations are valid for it.
301  *
302  * Must be S3COMMS_HRB_MAGIC to be valid.
303  *
304  * `body` (char *) :
305  *
306  * Pointer to start of HTTP body.
307  *
308  * Can be NULL, in which case it is treated as the empty string, "".
309  *
310  * `body_len` (size_t) :
311  *
312  * Number of bytes (characters) in `body`. 0 if empty or NULL `body`.
313  *
314  * `first_header` (hrb_node_t *) :
315  *
316  * Pointer to first SORTED header node, if any.
317  * It is left to the programmer to ensure that this node and associated
318  * list is destroyed when done.
319  *
320  * `resource` (char *) :
321  *
322  * Pointer to resource URL string, e.g., "/folder/page.xhtml".
323  *
324  * `verb` (char *) :
325  *
326  * Pointer to HTTP verb string, e.g., "GET".
327  *
328  * `version` (char *) :
329  *
330  * Pointer to HTTP version string, e.g., "HTTP/1.1".
331  *
332  *----------------------------------------------------------------------------
333  */
334 typedef struct { /* logical representation of one HTTP request */
335  unsigned long magic; /* must be S3COMMS_HRB_MAGIC for the structure to be valid */
336  char *body; /* start of the HTTP body; NULL is treated as the empty string */
337  size_t body_len; /* number of bytes in `body`; 0 if `body` is empty or NULL */
338  hrb_node_t *first_header; /* first node of the sorted header list, if any; the programmer must destroy the list when done */
339  char *resource; /* resource URL string, e.g., "/folder/page.xhtml" */
340  char *verb; /* HTTP verb string, e.g., "GET" */
341  char *version; /* HTTP version string, e.g., "HTTP/1.1" */
342 } hrb_t;
343 #define S3COMMS_HRB_MAGIC 0x6DCC84UL /* required value of hrb_t's `magic` field */
344 
345 
346 /*----------------------------------------------------------------------------
347  *
348  * Structure: parsed_url_t
349  *
350  *
351  * Represent a URL with easily-accessed pointers to logical elements within.
352  * These elements (components) are stored as null-terminated strings (or just
353  * NULLs). These components should be allocated for the structure, making the
354  * data as safe as possible from modification. If a component is NULL, it is
355  * either implicit in or absent from the URL.
356  *
357  * "http://mybucket.s3.amazonaws.com:8080/somefile.h5?param=value&arg=value"
358  * ^--^ ^-----------------------^ ^--^ ^---------^ ^-------------------^
359  * Scheme Host Port Resource Query/-ies
360  *
361  *
362  *
363  * `magic` (unsigned long)
364  *
365  * Structure identification and validation identifier.
366  * Identifies as `parsed_url_t` type.
367  *
368  * `scheme` (char *)
369  *
370  * String representing which protocol is to be expected.
371  * _Must_ be present.
372  * "http", "https", "ftp", e.g.
373  *
374  * `host` (char *)
375  *
376  * String of host, either domain name, IPv4, or IPv6 format.
377  * _Must_ be present.
378  * "over.rainbow.oz", "192.168.0.1", "[0000:0000:0000:0001]"
379  *
380  * `port` (char *)
381  *
382  * String representation of specified port. Must resolve to a valid unsigned
383  * integer.
384  * "9000", "80"
385  *
386  * `path` (char *)
387  *
388  * Path to resource on host. If not specified, assumes root "/".
389  * "lollipop_guild.wav", "characters/witches/white.dat"
390  *
391  * `query` (char *)
392  *
393  * Single string of all query parameters in url (if any).
394  * "arg1=value1&arg2=value2"
395  *
396  *----------------------------------------------------------------------------
397  */
398 typedef struct { /* URL split into NUL-terminated component strings; a NULL component is implicit in or absent from the URL */
399  unsigned long magic; /* identifies the structure as a parsed_url_t */
400  char *scheme; /* required; protocol, e.g., "http", "https" */
401  char *host; /* required; domain name, IPv4, or IPv6 form */
402  char *port; /* port as a string, e.g., "9000"; must resolve to a valid unsigned integer */
403  char *path; /* path to the resource on the host; root "/" is assumed if not specified */
404  char *query; /* all query parameters as one string, e.g., "arg1=value1&arg2=value2" */
405 } parsed_url_t;
406 #define S3COMMS_PARSED_URL_MAGIC 0x21D0DFUL /* presumably the `magic` value of a valid parsed_url_t -- confirm in H5FDs3comms.c */
407 
408 
409 /*----------------------------------------------------------------------------
410  *
411  * Structure: s3r_t
412  *
413  *
414  *
415  * S3 request structure "handle".
416  *
417  * Holds persistent information for Amazon S3 requests.
418  *
419  * Instantiated through `H5FD_s3comms_s3r_open()`, copies data into self.
420  *
421  * Intended to be re-used for operations on a remote object.
422  *
423  * Cleaned up through `H5FD_s3comms_s3r_close()`.
424  *
425  * _DO NOT_ share handle between threads: curl easy handle `curlhandle` has
426  * undefined behavior if called to perform in multiple threads.
427  *
428  *
429  *
430  * `magic` (unsigned long)
431  *
432  * "magic" number identifying this structure as unique type.
433  * MUST equal `S3COMMS_S3R_MAGIC` to be valid.
434  *
435  * `curlhandle` (CURL *)
436  *
437  * Pointer to the curl_easy handle generated for the request.
438  *
439  * `httpverb` (char *)
440  *
441  * Pointer to NULL-terminated string. HTTP verb,
442  * e.g. "GET", "HEAD", "PUT", etc.
443  *
444  * Default is NULL, resulting in a "GET" request.
445  *
446  * `purl` (parsed_url_t *)
447  *
448  * Pointer to structure holding the elements of URL for file open.
449  *
450  * e.g., "http://bucket.aws.com:8080/myfile.dat?q1=v1&q2=v2"
451  * parsed into...
452  * { scheme: "http"
453  * host: "bucket.aws.com"
454  * port: "8080"
455  * path: "myfile.dat"
456  * query: "q1=v1&q2=v2"
457  * }
458  *
459  * Cannot be NULL.
460  *
461  * `region` (char *)
462  *
463  * Pointer to NULL-terminated string, specifying S3 "region",
464  * e.g., "us-east-1".
465  *
466  * Required to authenticate.
467  *
468  * `secret_id` (char *)
469  *
470  * Pointer to NULL-terminated string for "secret" access id to S3 resource.
471  *
472  * Required to authenticate.
473  *
474  * `signing_key` (unsigned char *)
475  *
476  * Pointer to `SHA256_DIGEST_LENGTH`-long string for "re-usable" signing
477  * key, generated via
478  * `HMAC-SHA256(HMAC-SHA256(HMAC-SHA256(HMAC-SHA256("AWS4<secret_key>",
479  * "<yyyyMMDD>"), "<aws-region>"), "<aws-service>"), "aws4_request")`
480  * which may be re-used for several (up to seven (7)) days from creation?
481  * Computed once upon file open.
482  *
483  * Required to authenticate.
484  *
485  *----------------------------------------------------------------------------
486  */
487 typedef struct { /* persistent S3 request "handle"; do not share between threads */
488  unsigned long magic; /* number identifying this structure type; checked for validity */
489  CURL *curlhandle; /* curl easy handle generated for the request */
490  size_t filesize; /* NOTE(review): not described in the comment above; presumably the size in bytes of the remote object, cf. H5FD_s3comms_s3r_get_filesize() -- confirm */
491  char *httpverb; /* HTTP verb, e.g., "GET", "HEAD"; NULL results in a "GET" request */
492  parsed_url_t *purl; /* elements of the URL given at file open; cannot be NULL */
493  char *region; /* S3 region, e.g., "us-east-1"; required to authenticate */
494  char *secret_id; /* "secret" access id for the S3 resource; required to authenticate */
495  unsigned char *signing_key; /* SHA256_DIGEST_LENGTH-long re-usable signing key; required to authenticate */
496 } s3r_t;
497 
/* Value that s3r_t's `magic` field must hold for the handle to be valid.
 * The UL suffix gives the constant the same `unsigned long` type as that
 * field, consistent with S3COMMS_HRB_NODE_MAGIC, S3COMMS_HRB_MAGIC and
 * S3COMMS_PARSED_URL_MAGIC, so comparisons involve no signed/unsigned mix.
 */
#define S3COMMS_S3R_MAGIC 0x44d8d79UL
499 
500 #ifdef __cplusplus
501 extern "C" {
502 #endif
503 
504 /*******************************************
505  * DECLARATION OF HTTP FIELD LIST ROUTINES *
506  *******************************************/
507 
508 H5_DLL herr_t H5FD_s3comms_hrb_node_set(hrb_node_t **L, /* presumably the address of the list-head pointer, so the head can change -- confirm in H5FDs3comms.c */
509  const char *name, /* header field name; names are case-insensitive within a list */
510  const char *value); /* header field value; NOTE(review): treatment of a NULL value is not visible in this header */
511 
512 /***********************************************
513  * DECLARATION OF HTTP REQUEST BUFFER ROUTINES *
514  ***********************************************/
515 
516 H5_DLL herr_t H5FD_s3comms_hrb_destroy(hrb_t **buf); /* takes the address of an hrb_t pointer; presumably releases the request buffer -- confirm; the header list must be destroyed by the programmer (see hrb_t `first_header`) */
517 
518 H5_DLL hrb_t * H5FD_s3comms_hrb_init_request(const char *verb, /* HTTP verb string, e.g., "GET" */
519  const char *resource, /* resource URL string, e.g., "/folder/page.xhtml" */
520  const char *host); /* presumably the value for the "Host" header field -- confirm; failure value of the returned pointer is not visible here */
521 
522 /*************************************
523  * DECLARATION OF S3REQUEST ROUTINES *
524  *************************************/
525 
526 H5_DLL herr_t H5FD_s3comms_s3r_close(s3r_t *handle); /* cleans up a handle instantiated through H5FD_s3comms_s3r_open() */
527 
528 H5_DLL size_t H5FD_s3comms_s3r_get_filesize(s3r_t *handle); /* presumably reports the handle's `filesize` member -- confirm in H5FDs3comms.c */
529 
530 H5_DLL s3r_t * H5FD_s3comms_s3r_open(const char url[], /* URL of the remote object; instantiates an s3r_t handle and copies data into it */
531  const char region[], /* S3 region, e.g., "us-east-1" */
532  const char id[], /* presumably the access id kept in the handle's `secret_id` -- confirm */
533  const unsigned char signing_key[]); /* presumably SHA256_DIGEST_LENGTH bytes, cf. s3r_t `signing_key` -- confirm; clean up the result with H5FD_s3comms_s3r_close() */
534 
535 H5_DLL herr_t H5FD_s3comms_s3r_read(s3r_t *handle, /* handle from H5FD_s3comms_s3r_open() */
536  haddr_t offset, /* presumably the starting byte offset within the remote object -- confirm */
537  size_t len, /* presumably the number of bytes requested -- confirm */
538  void *dest); /* destination buffer for the received bytes; required capacity is not stated in this header */
539 
540 /*********************************
541  * DECLARATION OF OTHER ROUTINES *
542  *********************************/
543 
544 H5_DLL struct tm * gmnow(void); /* NOTE(review): name suggests the current time as a GMT broken-down time, usable as `now_gm` for ISO8601NOW/RFC7231NOW -- confirm; lifetime of the returned pointer is not visible here */
545 
546 H5_DLL herr_t H5FD_s3comms_aws_canonical_request(char *canonical_request_dest, /* presumably the output buffer for the canonical request string -- confirm */
547  int cr_size, /* presumably the capacity of canonical_request_dest -- confirm */
548  char *signed_headers_dest, /* presumably the output buffer for the signed-headers string -- confirm */
549  int sh_size, /* presumably the capacity of signed_headers_dest -- confirm */
550  hrb_t *http_request); /* request supplying the sorted header list used for the canonical request */
551 
552 H5_DLL herr_t H5FD_s3comms_bytes_to_hex(char *dest, /* output buffer; required capacity is not stated in this header */
553  const unsigned char *msg, /* input bytes */
554  size_t msg_len, /* number of bytes in msg */
555  hbool_t lowercase); /* presumably selects lowercase hex digits when true -- confirm */
556 
557 H5_DLL herr_t H5FD_s3comms_free_purl(parsed_url_t *purl); /* presumably releases a parsed_url_t and its component strings -- confirm in H5FDs3comms.c */
558 
559 H5_DLL herr_t H5FD_s3comms_HMAC_SHA256(const unsigned char *key, /* HMAC key bytes */
560  size_t key_len, /* number of bytes in key */
561  const char *msg, /* message to authenticate */
562  size_t msg_len, /* number of bytes in msg */
563  char *dest); /* output buffer; NOTE(review): whether it receives raw digest bytes or hex text, and its required size, are not visible here */
564 
565 H5_DLL herr_t H5FD_s3comms_load_aws_profile(const char *name, /* presumably the AWS profile name to look up -- confirm */
566  char *key_id_out, /* output buffer; required capacity is not stated in this header */
567  char *secret_access_key_out, /* output buffer; required capacity is not stated in this header */
568  char *aws_region_out); /* output buffer; required capacity is not stated in this header */
569 
570 H5_DLL herr_t H5FD_s3comms_nlowercase(char *dest, /* output buffer */
571  const char *s, /* input string */
572  size_t len); /* presumably the number of characters of s to convert -- confirm; NUL-termination of dest is not stated here */
573 
574 H5_DLL herr_t H5FD_s3comms_parse_url(const char *str, /* URL text to parse */
575  parsed_url_t **purl); /* receives the parsed_url_t; presumably allocated by this call and released with H5FD_s3comms_free_purl() -- confirm */
576 
577 H5_DLL herr_t H5FD_s3comms_percent_encode_char(char *repr, /* output buffer for the encoded representation; required capacity is not stated here */
578  const unsigned char c, /* character to encode */
579  size_t *repr_len); /* presumably receives the length of the representation -- confirm */
580 
581 H5_DLL herr_t H5FD_s3comms_signing_key(unsigned char *md, /* output; presumably SHA256_DIGEST_LENGTH bytes, cf. s3r_t `signing_key` -- confirm */
582  const char *secret, /* secret key text */
583  const char *region, /* AWS region, e.g., "us-east-1" */
584  const char *iso8601now); /* presumably a timestamp in the ISO8601NOW format -- confirm */
585 
586 H5_DLL herr_t H5FD_s3comms_tostringtosign(char *dest, /* output buffer; required capacity is not stated in this header */
587  const char *req_str, /* presumably the canonical request text -- confirm */
588  const char *now, /* presumably a timestamp in the ISO8601NOW format -- confirm */
589  const char *region); /* AWS region, e.g., "us-east-1" */
590 
591 H5_DLL herr_t H5FD_s3comms_trim(char *dest, /* output buffer */
592  char *s, /* input text; not const-qualified in this declaration */
593  size_t s_len, /* number of characters in s */
594  size_t *n_written); /* presumably receives the number of characters written to dest -- confirm */
595 
596 H5_DLL herr_t H5FD_s3comms_uriencode(char *dest, const char *s, size_t s_len, /* dest: output buffer; s, s_len: input text and its length */
597  hbool_t encode_slash, size_t *n_written); /* encode_slash: presumably whether '/' is also encoded -- confirm; n_written: presumably receives the output length -- confirm */
598 
599 #ifdef __cplusplus
600 }
601 #endif
602 
603 #endif /* H5_HAVE_ROS3_VFD */
604 
haddr_t
CATCH haddr_t
Definition: H5EAdblock.c:162
path
H5T_path_t ** path
Definition: H5T.c:558
H5_DLL
#define H5_DLL
Definition: H5api_adpt.h:234
herr_t
int herr_t
Definition: H5public.h:128
hbool_t
bool hbool_t
Definition: H5public.h:159
H5private.h