[Darshan-commits] [Git][darshan/darshan][lustre-mod] 3 commits: fixed a bug preventing shared file reduction from working correctly
Glenn K. Lockwood
xgitlab at cels.anl.gov
Wed Jun 15 16:26:03 CDT 2016
Glenn K. Lockwood pushed to branch lustre-mod at darshan / darshan
Commits:
4264247f by Glenn K. Lockwood at 2016-06-15T13:42:28-07:00
fixed a bug preventing shared file reduction from working correctly
- - - - -
735fc530 by Glenn K. Lockwood at 2016-06-15T14:18:37-07:00
moved the temporary record buffer used in darshan-{convert,parser} into the heap; darshan_diff remains on the stack
- - - - -
b51c463b by Glenn K. Lockwood at 2016-06-15T14:19:47-07:00
working set of tools to parse darshan lustre-mod logfiles
- - - - -
7 changed files:
- darshan-lustre-log-format.h
- darshan-runtime/lib/darshan-lustre.c
- darshan-util/darshan-convert.c
- darshan-util/darshan-diff.c
- darshan-util/darshan-logutils.h
- darshan-util/darshan-lustre-logutils.c
- darshan-util/darshan-parser.c
Changes:
=====================================
darshan-lustre-log-format.h
=====================================
--- a/darshan-lustre-log-format.h
+++ b/darshan-lustre-log-format.h
@@ -7,10 +7,14 @@
#ifndef __DARSHAN_LUSTRE_LOG_FORMAT_H
#define __DARSHAN_LUSTRE_LOG_FORMAT_H
+/* NOTE -- redefining the size of OST_ID will require changing the DARSHAN_BSWAP
+ * macro used in darshan-util/darshan-lustre-logutils.c as well
+ */
+typedef int64_t OST_ID;
+
/* current Lustre log format version */
#define DARSHAN_LUSTRE_VER 1
-/* TODO: add integer counters here (e.g., counter for stripe width, stripe size, etc etc) */
#define LUSTRE_COUNTERS \
/* number of OSTs for file system */\
X(LUSTRE_OSTS) \
@@ -44,7 +48,12 @@ struct darshan_lustre_record
darshan_record_id rec_id;
int64_t rank;
int64_t counters[LUSTRE_NUM_INDICES];
- int64_t ost_ids[1];
+ OST_ID ost_ids[1];
};
+/*
+ * helper macro to calculate the size in bytes of a record holding `osts` OST IDs
+ */
+#define LUSTRE_RECORD_SIZE( osts ) ( sizeof(struct darshan_lustre_record) + sizeof(OST_ID) * (osts - 1) )
+
#endif /* __DARSHAN_LUSTRE_LOG_FORMAT_H */
=====================================
darshan-runtime/lib/darshan-lustre.c
=====================================
--- a/darshan-runtime/lib/darshan-lustre.c
+++ b/darshan-runtime/lib/darshan-lustre.c
@@ -44,7 +44,6 @@ static void lustre_record_reduction_op(void* infile_v, void* inoutfile_v,
#define LUSTRE_LOCK() pthread_mutex_lock(&lustre_runtime_mutex)
#define LUSTRE_UNLOCK() pthread_mutex_unlock(&lustre_runtime_mutex)
-#define LUSTRE_RECORD_SIZE( osts ) ( sizeof(struct darshan_lustre_record) + sizeof(int64_t) * (osts - 1) )
void darshan_instrument_lustre_file(const char* filepath, int fd)
{
@@ -263,9 +262,17 @@ static void lustre_get_output_data(
*/
sort_lustre_records();
- /* allocate memory for the reduction output on rank 0 */
+ /* simply drop all shared records from non-root ranks by truncating
+ * the record array and recalculating the size of the used buffer
+ */
if (my_rank != 0)
+ {
lustre_runtime->record_count -= shared_rec_count;
+ lustre_runtime->record_buffer_used = 0;
+ for ( i = 0; i < lustre_runtime->record_count; i++ )
+ lustre_runtime->record_buffer_used +=
+ LUSTRE_RECORD_SIZE( (lustre_runtime->record_runtime_array[i]).record->counters[LUSTRE_STRIPE_WIDTH] );
+ }
}
*lustre_buf = (void *)(lustre_runtime->record_buffer);
=====================================
darshan-util/darshan-convert.c
=====================================
--- a/darshan-util/darshan-convert.c
+++ b/darshan-util/darshan-convert.c
@@ -19,8 +19,6 @@
#include "darshan-logutils.h"
-#define DEF_MOD_BUF_SIZE 1024 /* 1 KiB is enough for all current mod records ... */
-
extern uint32_t darshan_hashlittle(const void *key, size_t length, uint32_t initval);
int usage (char *exename)
@@ -233,7 +231,7 @@ int main(int argc, char **argv)
struct darshan_mnt_info *mnt_data_array;
struct darshan_record_ref *rec_hash = NULL;
struct darshan_record_ref *ref, *tmp;
- char mod_buf[DEF_MOD_BUF_SIZE];
+ char *mod_buf;
enum darshan_comp_type comp_type;
int bzip2;
int obfuscate;
@@ -338,6 +336,14 @@ int main(int argc, char **argv)
return(-1);
}
+ mod_buf = malloc(DEF_MOD_BUF_SIZE);
+ if (!mod_buf)
+ {
+ darshan_log_close(infile);
+ darshan_log_close(outfile);
+ return(-1);
+ }
+
/* loop over each module and convert its data to the new format */
for(i=0; i<DARSHAN_MAX_MODS; i++)
{
@@ -364,6 +370,7 @@ int main(int argc, char **argv)
darshan_log_close(infile);
darshan_log_close(outfile);
unlink(outfile_name);
+ free(mod_buf);
return(-1);
}
@@ -384,6 +391,7 @@ int main(int argc, char **argv)
}
} while((ret = mod_logutils[i]->log_get_record(infile, mod_buf, &rec_id)) == 1);
}
+ free(mod_buf);
darshan_log_close(infile);
darshan_log_close(outfile);
=====================================
darshan-util/darshan-diff.c
=====================================
--- a/darshan-util/darshan-diff.c
+++ b/darshan-util/darshan-diff.c
@@ -13,8 +13,6 @@
#include "darshan-logutils.h"
#include "uthash-1.9.2/src/uthash.h"
-#define DEF_MOD_BUF_SIZE 1024 /* 1 KiB is enough for all current mod records ... */
-
/* XXX: this structure is a temporary hack to get at the rank for each module's record */
struct darshan_base_rec
{
=====================================
darshan-util/darshan-logutils.h
=====================================
--- a/darshan-util/darshan-logutils.h
+++ b/darshan-util/darshan-logutils.h
@@ -17,6 +17,11 @@
#include "darshan-log-format.h"
+/* Maximum size of a record - Lustre OST lists can get huge, but 81920 is enough
+ * for 10K OSTs
+ */
+#define DEF_MOD_BUF_SIZE 81920 /* 80 KiB */
+
struct darshan_fd_int_state;
/* darshan file descriptor definition */
=====================================
darshan-util/darshan-lustre-logutils.c
=====================================
--- a/darshan-util/darshan-lustre-logutils.c
+++ b/darshan-util/darshan-lustre-logutils.c
@@ -49,27 +49,45 @@ static int darshan_log_get_lustre_record(darshan_fd fd, void* lustre_buf,
int i;
int ret;
+ /* retrieve the fixed-size portion of the record */
ret = darshan_log_getmod(fd, DARSHAN_LUSTRE_MOD, lustre_buf,
sizeof(struct darshan_lustre_record));
if(ret < 0)
return(-1);
else if(ret < sizeof(struct darshan_lustre_record))
return(0);
- else
+
+ rec = (struct darshan_lustre_record *)lustre_buf;
+
+ /* swap bytes if necessary */
+ if(fd->swap_flag)
{
- rec = (struct darshan_lustre_record *)lustre_buf;
- if(fd->swap_flag)
- {
- /* swap bytes if necessary */
- DARSHAN_BSWAP64(&rec->rec_id);
- DARSHAN_BSWAP64(&rec->rank);
- for(i=0; i<LUSTRE_NUM_INDICES; i++)
- DARSHAN_BSWAP64(&rec->counters[i]);
- }
+ DARSHAN_BSWAP64(&rec->rec_id);
+ DARSHAN_BSWAP64(&rec->rank);
+ for(i=0; i<LUSTRE_NUM_INDICES; i++)
+ DARSHAN_BSWAP64(&rec->counters[i]);
+ }
- *rec_id = rec->rec_id;
- return(1);
+ /* now read the rest of the record */
+ if ( rec->counters[LUSTRE_STRIPE_WIDTH] > 1 ) {
+ ret = darshan_log_getmod(
+ fd,
+ DARSHAN_LUSTRE_MOD,
+ (void*)(&(rec->ost_ids[1])),
+ (rec->counters[LUSTRE_STRIPE_WIDTH] - 1)*sizeof(OST_ID)
+ );
+ if(ret < 0)
+ return(-1);
+ else if(ret < sizeof(struct darshan_lustre_record))
+ return(0);
+ /* swap bytes if necessary */
+ if ( fd->swap_flag )
+ for (i = 1; i < rec->counters[LUSTRE_STRIPE_WIDTH]; i++ )
+ DARSHAN_BSWAP64(&(rec->ost_ids[i]));
}
+
+ *rec_id = rec->rec_id;
+ return(1);
}
static int darshan_log_put_lustre_record(darshan_fd fd, void* lustre_buf, int ver)
@@ -78,7 +96,7 @@ static int darshan_log_put_lustre_record(darshan_fd fd, void* lustre_buf, int ve
int ret;
ret = darshan_log_putmod(fd, DARSHAN_LUSTRE_MOD, rec,
- sizeof(struct darshan_lustre_record), ver);
+ LUSTRE_RECORD_SIZE(rec->counters[LUSTRE_STRIPE_WIDTH]), ver);
if(ret < 0)
return(-1);
@@ -99,6 +117,20 @@ static void darshan_log_print_lustre_record(void *rec, char *file_name,
lustre_rec->counters[i], file_name, mnt_pt, fs_type);
}
+ for (i = 0; i < lustre_rec->counters[LUSTRE_STRIPE_WIDTH]; i++ )
+ {
+ char strbuf[25];
+ snprintf( strbuf, 25, "LUSTRE_OST_ID_%d", i );
+ DARSHAN_COUNTER_PRINT(darshan_module_names[DARSHAN_LUSTRE_MOD],
+ lustre_rec->rank,
+ lustre_rec->rec_id,
+ strbuf,
+ lustre_rec->ost_ids[i],
+ file_name,
+ mnt_pt,
+ fs_type);
+ }
+
return;
}
@@ -110,7 +142,8 @@ static void darshan_log_print_lustre_description()
printf("# LUSTRE_MDTS: number of MDTs across the entire file system.\n");
printf("# LUSTRE_STRIPE_SIZE: stripe size for file in bytes.\n");
printf("# LUSTRE_STRIPE_WIDTH: number of OSTs over which file is striped.\n");
- printf("# LUSTRE_STRIPE_OFFSET: OBD index of the file's first stripe.\n");
+ printf("# LUSTRE_STRIPE_OFFSET: OST ID offset specified when the file was created.\n");
+ printf("# LUSTRE_OST_ID_*: indices of OSTs over which the file is striped.\n");
DARSHAN_PRINT_HEADER();
@@ -156,6 +189,36 @@ static void darshan_log_print_lustre_record_diff(void *rec1, char *file_name1,
}
}
+ /* would it be more or less useful to sort the OST IDs before comparing? */
+ if ( lustre_rec1->counters[LUSTRE_STRIPE_WIDTH] == lustre_rec2->counters[LUSTRE_STRIPE_WIDTH] ) {
+ for (i = 0; i < lustre_rec1->counters[LUSTRE_STRIPE_WIDTH]; i++ )
+ {
+ if (lustre_rec1->ost_ids[i] != lustre_rec2->ost_ids[i])
+ {
+ char strbuf[25];
+ snprintf( strbuf, 25, "LUSTRE_OST_ID_%d", i );
+ printf("- ");
+ DARSHAN_COUNTER_PRINT(darshan_module_names[DARSHAN_LUSTRE_MOD],
+ lustre_rec1->rank,
+ lustre_rec1->rec_id,
+ strbuf,
+ lustre_rec1->ost_ids[i],
+ file_name1,
+ "",
+ "");
+ printf("+ ");
+ DARSHAN_COUNTER_PRINT(darshan_module_names[DARSHAN_LUSTRE_MOD],
+ lustre_rec2->rank,
+ lustre_rec2->rec_id,
+ strbuf,
+ lustre_rec2->ost_ids[i],
+ file_name2,
+ "",
+ "");
+ }
+ }
+ }
+
return;
}
=====================================
darshan-util/darshan-parser.c
=====================================
--- a/darshan-util/darshan-parser.c
+++ b/darshan-util/darshan-parser.c
@@ -20,8 +20,6 @@
#include "darshan-logutils.h"
-#define DEF_MOD_BUF_SIZE 1024 /* 1 KiB is enough for all current mod records ... */
-
/*
* Options
*/
@@ -211,7 +209,7 @@ int main(int argc, char **argv)
char *save;
char buffer[DARSHAN_JOB_METADATA_LEN];
int empty_mods = 0;
- char mod_buf[DEF_MOD_BUF_SIZE];
+ char *mod_buf;
hash_entry_t *file_hash = NULL;
hash_entry_t *curr = NULL;
@@ -350,6 +348,12 @@ int main(int argc, char **argv)
memset(pdata.rank_cumul_md_time, 0, sizeof(double)*job.nprocs);
}
+ mod_buf = malloc(DEF_MOD_BUF_SIZE);
+ if (!mod_buf) {
+ darshan_log_close(fd);
+ return(-1);
+ }
+
for(i=0; i<DARSHAN_MAX_MODS; i++)
{
darshan_record_id rec_id;
@@ -624,6 +628,7 @@ cleanup:
darshan_log_close(fd);
free(pdata.rank_cumul_io_time);
free(pdata.rank_cumul_md_time);
+ free(mod_buf);
/* free record hash data */
HASH_ITER(hlink, rec_hash, ref, tmp_ref)
View it on GitLab: https://xgitlab.cels.anl.gov/darshan/darshan/compare/fad7e81662c2b6c1c4a749c9299d6bb65537c022...b51c463b59485b4d3f2abd90f63285ff8d72d06f
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.mcs.anl.gov/pipermail/darshan-commits/attachments/20160615/454e9cf5/attachment-0001.html>
More information about the Darshan-commits
mailing list