HDF5
1.12.0
hdf5-1.12.0
src
H5ACpkg.h
Go to the documentation of this file.
1
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2
* Copyright by The HDF Group. *
3
* Copyright by the Board of Trustees of the University of Illinois. *
4
* All rights reserved. *
5
* *
6
* This file is part of HDF5. The full HDF5 copyright notice, including *
7
* terms governing use, modification, and redistribution, is contained in *
8
* the COPYING file, which can be found at the root of the source code *
9
* distribution tree, or in https://support.hdfgroup.org/ftp/HDF5/releases. *
10
* If you do not have access to either file, you may request a copy from *
11
* help@hdfgroup.org. *
12
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
13
14
/*
15
* Programmer: John Mainzer -- 4/19/06
16
*
17
* Purpose: This file contains declarations which are normally visible
18
* only within the H5AC package (just H5AC.c at present).
19
*
20
* Source files outside the H5AC package should include
21
* H5ACprivate.h instead.
22
*
23
* The one exception to this rule is testpar/t_cache.c. The
24
* test code is easier to write if it can look at H5AC_aux_t.
25
* Indeed, this is the main reason why this file was created.
26
*
27
*/
28
29
#if !(defined H5AC_FRIEND || defined H5AC_MODULE)
30
#error "Do not include this file outside the H5AC package!"
31
#endif
32
33
#ifndef _H5ACpkg_H
34
#define _H5ACpkg_H
35
36
/* Get package's private header */
37
#include "
H5ACprivate.h
"
/* Metadata cache */
38
39
40
/* Get needed headers */
41
#include "
H5Cprivate.h
"
/* Cache */
42
#include "
H5FLprivate.h
"
/* Free Lists */
43
44
/*****************************/
45
/* Package Private Variables */
46
/*****************************/
47
48
/* Declare extern the free list to manage the H5AC_aux_t struct */
49
H5FL_EXTERN(H5AC_aux_t);
50
51
52
/**************************/
53
/* Package Private Macros */
54
/**************************/
55
56
#define H5AC_DEBUG_DIRTY_BYTES_CREATION 0
57
58
#ifdef H5_HAVE_PARALLEL
59
60
/* The following #defines are used to specify the operation required
61
* at a sync point.
62
*/
63
64
#define H5AC_SYNC_POINT_OP__FLUSH_TO_MIN_CLEAN 0
65
#define H5AC_SYNC_POINT_OP__FLUSH_CACHE 1
66
67
#endif
/* H5_HAVE_PARALLEL */
68
69
/*-------------------------------------------------------------------------
70
* It is a bit difficult to set ranges of allowable values on the
71
* dirty_bytes_threshold field of H5AC_aux_t. The following are
72
* probably broader than they should be.
73
*-------------------------------------------------------------------------
74
*/
75
76
/* Bounds and default for the dirty_bytes_threshold field of H5AC_aux_t.
 *
 * The MIN and MAX replacement lists are wrapped in an outer pair of
 * parentheses so that the size_t cast and its operand always expand as a
 * single primary expression, regardless of the operators surrounding the
 * macro at the point of use (e.g. sizeof, or a postfix operator).
 */
#define H5AC__MIN_DIRTY_BYTES_THRESHOLD     ((size_t)(H5C__MIN_MAX_CACHE_SIZE / 2))
#define H5AC__DEFAULT_DIRTY_BYTES_THRESHOLD (256 * 1024)
#define H5AC__MAX_DIRTY_BYTES_THRESHOLD     ((size_t)(H5C__MAX_MAX_CACHE_SIZE / 4))
81
82
83
/****************************************************************************
84
*
85
* structure H5AC_aux_t
86
*
87
* While H5AC has become a wrapper for the cache implemented in H5C.c, there
88
* are some features of the metadata cache that are specific to it, and which
89
* therefore do not belong in the more generic H5C cache code.
90
*
91
* In particular, there is the matter of synchronizing writes from the
92
* metadata cache to disk in the PHDF5 case.
93
*
94
* Prior to this update, the presumption was that all metadata caches would
95
* write the same data at the same time since all operations modifying
96
* metadata must be performed collectively. Given this assumption, it was
97
* safe to allow only the writes from process 0 to actually make it to disk,
98
* while metadata writes from all other processes were discarded.
99
*
100
* Unfortunately, this presumption is in error as operations that read
101
* metadata need not be collective, but can change the location of dirty
102
* entries in the metadata cache LRU lists. This can result in the same
103
* metadata write operation triggering writes from the metadata caches on
104
* some processes, but not all (causing a hang), or in different sets of
105
* entries being written from different caches (potentially resulting in
106
* metadata corruption in the file).
107
*
108
* To deal with this issue, I decided to apply a paradigm shift to the way
109
* metadata is written to disk.
110
*
111
* With this set of changes, only the metadata cache on process 0 is able
112
* to write metadata to disk, although metadata caches on all other
113
* processes can read metadata from disk as before.
114
*
115
* To keep all the other caches from getting plugged up with dirty metadata,
116
* process 0 periodically broadcasts a list of entries that it has flushed
117
* since that last notice, and which are currently clean. The other caches
118
* mark these entries as clean as well, which allows them to evict the
119
* entries as needed.
120
*
121
* One obvious problem in this approach is synchronizing the broadcasts
122
* and receptions, as different caches may see different amounts of
123
* activity.
124
*
125
* The current solution is for the caches to track the number of bytes
126
* of newly generated dirty metadata, and to broadcast and receive
127
* whenever this value exceeds some user specified threshold.
128
*
129
* Maintaining this count is easy for all processes not on process 0 --
130
* all that is necessary is to add the size of the entry to the total
131
* whenever there is an insertion, a move of a previously clean entry,
132
* or whenever a previously clean entry is marked dirty in an unprotect.
133
*
134
* On process 0, we have to be careful not to count dirty bytes twice.
135
* If an entry is marked dirty, flushed, and marked dirty again, all
136
* within a single reporting period, only the first marking should
137
* be added to the dirty bytes generated tally, as that is all that
138
* the other processes will see.
139
*
140
* At present, this structure exists to maintain the fields needed to
141
* implement the above scheme, and thus is only used in the parallel
142
* case. However, other uses may arise in the future.
143
*
144
* Instances of this structure are associated with metadata caches via
145
* the aux_ptr field of H5C_t (see H5Cpkg.h). The H5AC code is
146
* responsible for allocating, maintaining, and discarding instances
147
* of H5AC_aux_t.
148
*
149
* The remainder of this header comment documents the individual fields
150
* of the structure.
151
*
152
* JRM - 6/27/05
153
*
154
* Update: When the above was written, I planned to allow the process
155
* 0 metadata cache to write dirty metadata between sync points.
156
* However, testing indicated that this allowed occasional
157
* messages from the future to reach the caches on other processes.
158
*
159
* To resolve this, the code was altered to require that all metadata
160
* writes take place during sync points -- which solved the problem.
161
* Initially all writes were performed by the process 0 cache. This
162
* approach was later replaced with a distributed write approach
163
* in which each process writes a subset of the metadata to be
164
* written.
165
*
166
* After thinking on the matter for a while, I arrived at the
167
* conclusion that the process 0 cache could be allowed to write
168
* dirty metadata between sync points if it restricted itself to
169
* entries that had been dirty at the time of the previous sync point.
170
*
171
* To date, there has been no attempt to implement this optimization.
172
* However, should it be attempted, much of the supporting code
173
* should still be around.
174
*
175
* JRM -- 1/6/15
176
*
177
* magic: Unsigned 32 bit integer always set to
178
* H5AC__H5AC_AUX_T_MAGIC. This field is used to validate
179
* pointers to instances of H5AC_aux_t.
180
*
181
* mpi_comm: MPI communicator associated with the file for which the
182
* cache has been created.
183
*
184
* mpi_rank: MPI rank of this process within mpi_comm.
185
*
186
* mpi_size: Number of processes in mpi_comm.
187
*
188
* write_permitted: Boolean flag used to control whether the cache
189
* is permitted to write to file.
190
*
191
* dirty_bytes_threshold: Integer field containing the dirty bytes
192
* generation threshold. Whenever dirty byte creation
193
* exceeds this value, the metadata cache on process 0
194
* broadcasts a list of the entries it has flushed since
195
* the last broadcast (or since the beginning of execution)
196
* and which are currently clean (if they are still in the
197
* cache)
198
*
199
* Similarly, metadata caches on processes other than process
200
* 0 will attempt to receive a list of clean entries whenever
201
* the threshold is exceeded.
202
*
203
* dirty_bytes: Integer field containing the number of bytes of dirty
204
* metadata generated since the beginning of the computation,
205
* or (more typically) since the last clean entries list
206
* broadcast. This field is reset to zero after each such
207
* broadcast.
208
*
209
* metadata_write_strategy: Integer code indicating how we will be
210
* writing the metadata. In the first incarnation of
211
* this code, all writes were done from process 0. This
212
* field exists to facilitate experiments with other
213
* strategies.
214
*
215
* At present, this field must be set to either
216
* H5AC_METADATA_WRITE_STRATEGY__PROCESS_0_ONLY or
217
* H5AC_METADATA_WRITE_STRATEGY__DISTRIBUTED.
218
*
219
* dirty_bytes_propagations: This field only exists when the
220
* H5AC_DEBUG_DIRTY_BYTES_CREATION #define is TRUE.
221
*
222
* It is used to track the number of times the cleaned list
223
* has been propagated from process 0 to the other
224
* processes.
225
*
226
* unprotect_dirty_bytes: This field only exists when the
227
* H5AC_DEBUG_DIRTY_BYTES_CREATION #define is TRUE.
228
*
229
* It is used to track the number of dirty bytes created
230
* via unprotect operations since the last time the cleaned
231
* list was propagated.
232
*
233
* unprotect_dirty_bytes_updates: This field only exists when the
234
* H5AC_DEBUG_DIRTY_BYTES_CREATION #define is TRUE.
235
*
236
* It is used to track the number of times dirty bytes have
237
* been created via unprotect operations since the last time
238
* the cleaned list was propagated.
239
*
240
* insert_dirty_bytes: This field only exists when the
241
* H5AC_DEBUG_DIRTY_BYTES_CREATION #define is TRUE.
242
*
243
* It is used to track the number of dirty bytes created
244
* via insert operations since the last time the cleaned
245
* list was propagated.
246
*
247
* insert_dirty_bytes_updates: This field only exists when the
248
* H5AC_DEBUG_DIRTY_BYTES_CREATION #define is TRUE.
249
*
250
* It is used to track the number of times dirty bytes have
251
* been created via insert operations since the last time
252
* the cleaned list was propagated.
253
*
254
* move_dirty_bytes: This field only exists when the
255
* H5AC_DEBUG_DIRTY_BYTES_CREATION #define is TRUE.
256
*
257
* It is used to track the number of dirty bytes created
258
* via move operations since the last time the cleaned
259
* list was propagated.
260
*
261
* move_dirty_bytes_updates: This field only exists when the
262
* H5AC_DEBUG_DIRTY_BYTES_CREATION #define is TRUE.
263
*
264
* It is used to track the number of times dirty bytes have
265
* been created via move operations since the last time
266
* the cleaned list was propagated.
267
*
268
* Things have changed a bit since the following four fields were defined.
269
* If metadata_write_strategy is H5AC_METADATA_WRITE_STRATEGY__PROCESS_0_ONLY,
270
* all comments hold as before -- with the caveat that pending further
271
* coding, the process 0 metadata cache is forbidden to flush entries outside
272
* of a sync point.
273
*
274
* However, for different metadata write strategies, these fields are used
275
* only to maintain the correct dirty byte count on process zero -- and in
276
* most if not all cases, this is redundant, as process zero will be barred
277
* from flushing entries outside of a sync point.
278
*
279
* JRM -- 3/16/10
280
*
281
* d_slist_ptr: Pointer to an instance of H5SL_t used to maintain a list
282
* of entries that have been dirtied since the last time they
283
* were listed in a clean entries broadcast. This list is
284
* only maintained by the metadata cache on process 0 -- it
285
* is used to maintain a view of the dirty entries as seen
286
* by the other caches, so as to keep the dirty bytes count
287
* in synchronization with them.
288
*
289
* Thus on process 0, the dirty_bytes count is incremented
290
* only if either
291
*
292
* 1) an entry is inserted in the metadata cache, or
293
*
294
* 2) a previously clean entry is moved, and it does not
295
* already appear in the dirty entry list, or
296
*
297
* 3) a previously clean entry is unprotected with the
298
* dirtied flag set and the entry does not already appear
299
* in the dirty entry list.
300
*
301
* Entries are added to the dirty entry list whenever they cause
302
* the dirty bytes count to be increased. They are removed
303
* when they appear in a clean entries broadcast. Note that
304
* moves must be reflected in the dirty entry list.
305
*
306
* To reiterate, this field is only used on process 0 -- it
307
* should be NULL on all other processes.
308
*
309
* c_slist_ptr: Pointer to an instance of H5SL_t used to maintain a list
310
* of entries that were dirty, have been flushed
311
* to disk since the last clean entries broadcast, and are
312
* still clean. Since only process 0 can write to disk, this
313
* list only exists on process 0.
314
*
315
* In essence, this slist is used to assemble the contents of
316
* the next clean entries broadcast. The list is emptied after
317
* each broadcast.
318
*
319
* The following two fields are used only when metadata_write_strategy
320
* is H5AC_METADATA_WRITE_STRATEGY__DISTRIBUTED.
321
*
322
* candidate_slist_ptr: Pointer to an instance of H5SL_t used by process 0
323
* to construct a list of entries to be flushed at this sync
324
* point. This list is then broadcast to the other processes,
325
* which then either flush or mark clean all entries on it.
326
*
327
* write_done: In the parallel test bed, it is necessary to ensure that
328
* all writes to the server process from cache 0 complete
329
* before it enters the barrier call with the other caches.
330
*
331
* The write_done callback allows t_cache to do this without
332
* requiring an ACK on each write. Since these ACKs greatly
333
* increase the run time on some platforms, this is a
334
* significant optimization.
335
*
336
* This field must be set to NULL when the callback is not
337
* needed.
338
*
339
* Note: This field has been extended for use by all processes
340
* with the addition of support for the distributed
341
* metadata write strategy.
342
* JRM -- 5/9/10
343
*
344
* sync_point_done: In the parallel test bed, it is necessary to verify
345
* that the expected writes, and only the expected writes,
346
* have taken place at the end of each sync point.
347
*
348
* The sync_point_done callback allows t_cache to perform
349
* this verification. The field is set to NULL when the
350
* callback is not needed.
351
*
352
* The following field supports the metadata cache image feature.
353
*
354
* p0_image_len: Unsigned integer containing the length of the metadata cache
355
* image constructed by MPI process 0. This field should be 0
356
* if the value is unknown, or if cache image is not enabled.
357
*
358
****************************************************************************/
359
360
#ifdef H5_HAVE_PARALLEL
361
362
/* Value the magic field of H5AC_aux_t is always set to; used to validate
 * pointers to instances of H5AC_aux_t.  The replacement list is fully
 * parenthesized so the cast and the constant expand as one expression.
 */
#define H5AC__H5AC_AUX_T_MAGIC ((unsigned)0x00D0A01)
363
364
typedef
struct
H5AC_aux_t
365
{
366
uint32_t
magic;
367
368
MPI_Comm mpi_comm;
369
370
int
mpi_rank;
371
372
int
mpi_size;
373
374
hbool_t
write_permitted;
375
376
size_t
dirty_bytes_threshold;
377
378
size_t
dirty_bytes;
379
380
int32_t metadata_write_strategy;
381
382
#if H5AC_DEBUG_DIRTY_BYTES_CREATION
383
384
unsigned
dirty_bytes_propagations;
385
386
size_t
unprotect_dirty_bytes;
387
unsigned
unprotect_dirty_bytes_updates;
388
389
size_t
insert_dirty_bytes;
390
unsigned
insert_dirty_bytes_updates;
391
392
size_t
move_dirty_bytes;
393
unsigned
move_dirty_bytes_updates;
394
395
#endif
/* H5AC_DEBUG_DIRTY_BYTES_CREATION */
396
397
H5SL_t
* d_slist_ptr;
398
399
H5SL_t
* c_slist_ptr;
400
401
H5SL_t
* candidate_slist_ptr;
402
403
void (* write_done)(void);
404
405
void (* sync_point_done)(
unsigned
num_writes,
406
haddr_t
* written_entries_tbl);
407
408
unsigned
p0_image_len;
409
410
} H5AC_aux_t;
/* struct H5AC_aux_t */
411
#endif
/* H5_HAVE_PARALLEL */
412
413
414
/******************************/
415
/* Package Private Prototypes */
416
/******************************/
417
418
#ifdef H5_HAVE_PARALLEL
419
/* Parallel I/O routines */
420
H5_DLL
herr_t
H5AC__log_deleted_entry(
const
H5AC_info_t
*entry_ptr);
421
H5_DLL
herr_t
H5AC__log_dirtied_entry(
const
H5AC_info_t
*entry_ptr);
422
H5_DLL
herr_t
H5AC__log_cleaned_entry(
const
H5AC_info_t
*entry_ptr);
423
H5_DLL
herr_t
H5AC__log_flushed_entry(
H5C_t
*cache_ptr,
haddr_t
addr,
424
hbool_t
was_dirty,
unsigned
flags);
425
H5_DLL
herr_t
H5AC__log_inserted_entry(
const
H5AC_info_t
*entry_ptr);
426
H5_DLL
herr_t
H5AC__log_moved_entry(
const
H5F_t
*
f
,
haddr_t
old_addr,
427
haddr_t
new_addr);
428
H5_DLL
herr_t
H5AC__flush_entries(
H5F_t
*
f
);
429
H5_DLL
herr_t
H5AC__run_sync_point(
H5F_t
*
f
,
int
sync_point_op);
430
H5_DLL
herr_t
H5AC__set_sync_point_done_callback(
H5C_t
*cache_ptr,
431
void
(*sync_point_done)(
unsigned
num_writes,
haddr_t
*written_entries_tbl));
432
H5_DLL
herr_t
H5AC__set_write_done_callback(
H5C_t
* cache_ptr,
433
void
(* write_done)(
void
));
434
#endif
/* H5_HAVE_PARALLEL */
435
436
#endif
/* _H5ACpkg_H */
437
H5SL_t
Definition:
H5SL.c:557
f
hdr f
Definition:
H5EA.c:755
H5C_t
Definition:
H5Cpkg.h:4642
H5FL_EXTERN
H5FL_EXTERN(H5AC_aux_t)
uint32_t
uint32_t
Definition:
H5overflow.txt:38
haddr_t
CATCH haddr_t
Definition:
H5EAdblock.c:162
H5ACprivate.h
H5Cprivate.h
H5_DLL
#define H5_DLL
Definition:
H5api_adpt.h:234
H5F_t
Definition:
H5Fpkg.h:374
H5FLprivate.h
herr_t
int herr_t
Definition:
H5public.h:128
hbool_t
bool hbool_t
Definition:
H5public.h:159
H5C_cache_entry_t
Definition:
H5Cprivate.h:1597
Generated by
1.8.18