[Darshan-commits] [Darshan] branch, dev-modular, updated. 018122117ee9f50abaa2b5fd73b7d3133c09373c

Service Account git at mcs.anl.gov
Wed Feb 11 13:10:49 CST 2015


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "Darshan".

The branch, dev-modular has been updated
       via  018122117ee9f50abaa2b5fd73b7d3133c09373c (commit)
      from  5348600bd92eba9cf3d8935512b523a5232838f7 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
commit 018122117ee9f50abaa2b5fd73b7d3133c09373c
Author: Shane Snyder <ssnyder at mcs.anl.gov>
Date:   Wed Feb 11 13:10:25 2015 -0600

    revamped error handling runtime side

-----------------------------------------------------------------------

Summary of changes:
 darshan-log-format.h                |   15 ++-
 darshan-runtime/darshan-core.h      |    1 -
 darshan-runtime/darshan.h           |    1 -
 darshan-runtime/lib/darshan-core.c  |  236 ++++++++++++++++++----------------
 darshan-runtime/lib/darshan-posix.c |    3 -
 5 files changed, 135 insertions(+), 121 deletions(-)


Diff of changes:
diff --git a/darshan-log-format.h b/darshan-log-format.h
index f1d2c49..6f7d63d 100644
--- a/darshan-log-format.h
+++ b/darshan-log-format.h
@@ -30,25 +30,30 @@
 /* max length of exe string within job record (not counting '\0') */
 #define CP_EXE_LEN (CP_JOB_RECORD_SIZE - sizeof(struct darshan_job) - 1)
 
-/* max length of module name string (not counting '\0') */
-/* TODO */
-#define DARSHAN_MOD_NAME_LEN 31
-
 typedef uint64_t darshan_record_id;
 
 /* unique identifiers to distinguish between available darshan modules */
 /* NOTES: - valid ids range from [0...DARSHAN_MAX_MODS-1]
  *        - order of ids control module shutdown order (and consequently, order in log file)
  */
+/* TODO: enforce maximum? */
 #define DARSHAN_MAX_MODS 16
 typedef enum
 {
-    DARSHAN_POSIX_MOD,
+    DARSHAN_POSIX_MOD = 0,
     DARSHAN_MPIIO_MOD,
     DARSHAN_HDF5_MOD,
     DARSHAN_PNETCDF_MOD,
 } darshan_module_id;
 
+/* printable module names, indexed by darshan_module_id (keep in enum order) */
+static char *darshan_module_names[] =
+{
+    "POSIX",
+    "MPI-IO",
+    "HDF5",
+    "PNETCDF"
+};
 enum darshan_comp_type
 {
     DARSHAN_GZ_COMP,
diff --git a/darshan-runtime/darshan-core.h b/darshan-runtime/darshan-core.h
index 835bb42..91f68b5 100644
--- a/darshan-runtime/darshan-core.h
+++ b/darshan-runtime/darshan-core.h
@@ -18,7 +18,6 @@
 struct darshan_core_module
 {
     darshan_module_id id;
-    char name[DARSHAN_MOD_NAME_LEN+1];
     struct darshan_module_funcs mod_funcs;
 };
 
diff --git a/darshan-runtime/darshan.h b/darshan-runtime/darshan.h
index 87c6837..7b57874 100644
--- a/darshan-runtime/darshan.h
+++ b/darshan-runtime/darshan.h
@@ -44,7 +44,6 @@ struct darshan_module_funcs
 
 void darshan_core_register_module(
     darshan_module_id id,
-    char *name,
     struct darshan_module_funcs *funcs,
     int *runtime_mem_limit);
 
diff --git a/darshan-runtime/lib/darshan-core.c b/darshan-runtime/lib/darshan-core.c
index f33fded..ef03e4b 100644
--- a/darshan-runtime/lib/darshan-core.c
+++ b/darshan-runtime/lib/darshan-core.c
@@ -45,9 +45,11 @@ static void darshan_get_logfile_name(
     char* logfile_name, int jobid, struct tm* start_tm);
 static void darshan_log_record_hints_and_ver(
     struct darshan_core_runtime* job);
-static int darshan_get_shared_record_ids(
+static void darshan_get_shared_record_ids(
     struct darshan_core_runtime *job, darshan_record_id *shared_recs);
-static int darshan_log_write_record_map(
+static int darshan_log_coll_open(
+    char *logfile_name, MPI_File *log_fh);
+static int darshan_log_write_record_hash(
     MPI_File log_fh, struct darshan_core_record_ref *rec_hash,
     darshan_record_id *shared_recs, struct darshan_log_map *map);
 static int darshan_log_coll_write(
@@ -197,13 +199,11 @@ static void darshan_core_shutdown()
     int local_mod_use[DARSHAN_MAX_MODS] = {0};
     int global_mod_use_count[DARSHAN_MAX_MODS] = {0};
     darshan_record_id shared_recs[DARSHAN_CORE_MAX_RECORDS] = {0};
-    char *hints;
     double start_log_time;
     long offset;
     struct darshan_header log_header;
     MPI_File log_fh;
     MPI_Offset tmp_off;
-    MPI_Info info;
     MPI_Status status;
 
     if(getenv("DARSHAN_INTERNAL_TIMING"))
@@ -297,10 +297,6 @@ static void darshan_core_shutdown()
         final_job->log_job.end_time = last_end_time;
     }
 
-    /* XXX */
-    /* TODO: ensuing error checking...does MPI ensure collective I/O functions return the same error
-     * globally, or do I always need to allreduce????? */
-
     /* set which local modules were actually used */
     for(i = 0; i < DARSHAN_MAX_MODS; i++)
     {
@@ -312,72 +308,10 @@ static void darshan_core_shutdown()
     DARSHAN_MPI_CALL(PMPI_Allreduce)(local_mod_use, global_mod_use_count, DARSHAN_MAX_MODS, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
 
     /* get a list of records which are shared across all processes */
-    ret = darshan_get_shared_record_ids(final_job, shared_recs);
-
-    /* error out if unable to determine shared file records */
-    DARSHAN_MPI_CALL(PMPI_Allreduce)(&ret, &all_ret, 1, MPI_INT,
-        MPI_LOR, MPI_COMM_WORLD);
-    if(all_ret != 0)
-    {
-        if(my_rank == 0)
-        {
-            fprintf(stderr, "darshan library warning: unable to determine shared file records\n");
-        }
-        free(logfile_name);
-        darshan_core_cleanup(final_job);
-        return;
-    }
-
-    /* check environment variable to see if the default MPI file hints have
-     * been overridden
-     */
-    MPI_Info_create(&info);
-
-    hints = getenv(CP_LOG_HINTS_OVERRIDE);
-    if(!hints)
-    {
-        hints = __CP_LOG_HINTS;
-    }
-
-    if(hints && strlen(hints) > 0)
-    {
-        char *tok_str;
-        char *orig_tok_str;
-        char *key;
-        char *value;
-        char *saveptr = NULL;
-
-        tok_str = strdup(hints);
-        if(tok_str)
-        {
-            orig_tok_str = tok_str;
-            do
-            {
-                /* split string on semicolon */
-                key = strtok_r(tok_str, ";", &saveptr);
-                if(key)
-                {
-                    tok_str = NULL;
-                    /* look for = sign splitting key/value pairs */
-                    value = index(key, '=');
-                    if(value)
-                    {
-                        /* break key and value into separate null terminated strings */
-                        value[0] = '\0';
-                        value++;
-                        if(strlen(key) > 0)
-                            MPI_Info_set(info, key, value);
-                    }
-                }
-            }while(key != NULL);
-            free(orig_tok_str);
-        }
-    }
+    darshan_get_shared_record_ids(final_job, shared_recs);
 
-    /* open the darshan log file for writing */
-    ret = DARSHAN_MPI_CALL(PMPI_File_open)(MPI_COMM_WORLD, logfile_name,
-        MPI_MODE_CREATE | MPI_MODE_WRONLY | MPI_MODE_EXCL, info, &log_fh);
-    MPI_Info_free(&info);
+    /* collectively open the darshan log file */
+    ret = darshan_log_coll_open(logfile_name, &log_fh);
 
     /* error out if unable to open log file */
     DARSHAN_MPI_CALL(PMPI_Allreduce)(&ret, &all_ret, 1, MPI_INT,
@@ -386,12 +320,8 @@ static void darshan_core_shutdown()
     {
         if(my_rank == 0)
         {
-            int msg_len;
-            char msg[MPI_MAX_ERROR_STRING] = {0};
-
-            MPI_Error_string(ret, msg, &msg_len);
-            fprintf(stderr, "darshan library warning: unable to open log file %s: %s\n",
-                logfile_name, msg);
+            fprintf(stderr, "darshan library warning: unable to open log file %s\n",
+                logfile_name);
             unlink(logfile_name);
         }
         free(logfile_name);
@@ -403,26 +333,33 @@ static void darshan_core_shutdown()
     if(my_rank == 0)
     {
         /* write the job information, making sure to prealloc space for the log header */
-        ret = DARSHAN_MPI_CALL(PMPI_File_write_at)(log_fh, sizeof(struct darshan_header),
+        all_ret = DARSHAN_MPI_CALL(PMPI_File_write_at)(log_fh, sizeof(struct darshan_header),
                 &final_job->log_job, sizeof(struct darshan_job), MPI_BYTE, &status);
-        if(ret != MPI_SUCCESS)
+        if(all_ret != MPI_SUCCESS)
         {
-            int msg_len;
-            char msg[MPI_MAX_ERROR_STRING] = {0};
-
-            MPI_Error_string(ret, msg, &msg_len);
-            fprintf(stderr, "darshan library warning: unable to write job data to log file %s: %s\n",
-                    logfile_name, msg);
+            fprintf(stderr, "darshan library warning: unable to write job data to log file %s\n",
+                    logfile_name);
+            unlink(logfile_name);
         }
 
-        /* TODO */
+        /* TODO: after compression is added, this should be fixed */
         log_header.rec_map.off = sizeof(struct darshan_header) + sizeof(struct darshan_job);
     }
 
-    /* write the record name->id map to the log file */
-    ret = darshan_log_write_record_map(log_fh, final_job->rec_hash,
+    /* error out if unable to write job information */
+    DARSHAN_MPI_CALL(PMPI_Bcast)(&all_ret, 1, MPI_INT, 0, MPI_COMM_WORLD);
+    if(all_ret != 0)
+    {
+        free(logfile_name);
+        darshan_core_cleanup(final_job);
+        return;
+    }
+
+    /* write the record name->id hash to the log file */
+    ret = darshan_log_write_record_hash(log_fh, final_job->rec_hash,
         shared_recs, &log_header.rec_map);
 
+    /* error out if unable to write record hash */
     DARSHAN_MPI_CALL(PMPI_Allreduce)(&ret, &all_ret, 1, MPI_INT,
         MPI_LOR, MPI_COMM_WORLD);
     if(all_ret != 0)
@@ -431,6 +368,7 @@ static void darshan_core_shutdown()
         {
             fprintf(stderr, "darshan library warning: unable to write record map to log file %s\n",
                 logfile_name);
+            unlink(logfile_name);
         }
         free(logfile_name);
         darshan_core_cleanup(final_job);
@@ -441,7 +379,8 @@ static void darshan_core_shutdown()
      *      - get final output buffer
      *      - compress (zlib) provided output buffer
      *      - append compressed buffer to log file
-     *      - shutdown the module TODO
+     *      - add module index info (file offset/length) to log header
+     *      - shutdown the module
      */
     for(i = 0; i < DARSHAN_MAX_MODS; i++)
     {
@@ -483,9 +422,22 @@ static void darshan_core_shutdown()
 
         /* write module data buffer to the darshan log file */
         ret = darshan_log_coll_write(log_fh, mod_buf, mod_buf_size, &log_header.mod_map[i]);
-        if(ret < 0)
+
+        /* error out if unable to write this module's data */
+        DARSHAN_MPI_CALL(PMPI_Allreduce)(&ret, &all_ret, 1, MPI_INT,
+            MPI_LOR, MPI_COMM_WORLD);
+        if(all_ret != 0)
         {
-            /* TODO: */
+            if(my_rank == 0)
+            {
+                fprintf(stderr,
+                    "darshan library warning: unable to write %s module data to log file %s\n",
+                    darshan_module_names[i], logfile_name);
+                unlink(logfile_name);
+            }
+            free(logfile_name);
+            darshan_core_cleanup(final_job);
+            return;
         }
 
         tmp_off += log_header.mod_map[i].len;
@@ -508,20 +460,32 @@ static void darshan_core_shutdown()
         log_header.magic_nr = CP_MAGIC_NR;
         log_header.comp_type = DARSHAN_GZ_COMP;
 
-        ret = DARSHAN_MPI_CALL(PMPI_File_write_at)(log_fh, 0, &log_header,
+        all_ret = DARSHAN_MPI_CALL(PMPI_File_write_at)(log_fh, 0, &log_header,
             sizeof(struct darshan_header), MPI_BYTE, &status);
-        if(ret != MPI_SUCCESS)
+        if(all_ret != MPI_SUCCESS)
         {
-            /* TODO */
+            fprintf(stderr, "darshan library warning: unable to write header to log file %s\n",
+                    logfile_name);
+            unlink(logfile_name);
         }
     }
 
+    /* error out if unable to write log header */
+    DARSHAN_MPI_CALL(PMPI_Bcast)(&all_ret, 1, MPI_INT, 0, MPI_COMM_WORLD);
+    if(all_ret != 0)
+    {
+        free(logfile_name);
+        darshan_core_cleanup(final_job);
+        return;
+    }
+
     DARSHAN_MPI_CALL(PMPI_File_close)(&log_fh);
 
     /* if we got this far, there are no errors, so rename from *.darshan_partial
      * to *-<logwritetime>.darshan.gz, which indicates that this log file is
      * complete and ready for analysis
      */
+    /* TODO: support user given logfile path/name */
     if(my_rank == 0)
     {
         char* tmp_index;
@@ -758,12 +722,11 @@ static void darshan_log_record_hints_and_ver(struct darshan_core_runtime* job)
     return;
 }
 
-static int darshan_get_shared_record_ids(struct darshan_core_runtime *job,
+static void darshan_get_shared_record_ids(struct darshan_core_runtime *job,
     darshan_record_id *shared_recs)
 {
     int i;
     int ndx;
-    int ret;
     struct darshan_core_record_ref *ref, *tmp;
     darshan_record_id id_array[DARSHAN_CORE_MAX_RECORDS] = {0};
     darshan_record_id mask_array[DARSHAN_CORE_MAX_RECORDS] = {0};
@@ -780,13 +743,9 @@ static int darshan_get_shared_record_ids(struct darshan_core_runtime *job,
     }
 
     /* broadcast root's list of records to all other processes */
-    ret = DARSHAN_MPI_CALL(PMPI_Bcast)(id_array,
+    DARSHAN_MPI_CALL(PMPI_Bcast)(id_array,
         (DARSHAN_CORE_MAX_RECORDS * sizeof(darshan_record_id)),
         MPI_BYTE, 0, MPI_COMM_WORLD);
-    if(ret != 0)
-    {
-        return(-1);
-    }
 
     /* everyone looks to see if they opened the same records as root */
     for(i=0; (i<DARSHAN_CORE_MAX_RECORDS && id_array[i] != 0); i++)
@@ -803,12 +762,8 @@ static int darshan_get_shared_record_ids(struct darshan_core_runtime *job,
     }
 
     /* now allreduce so everyone agrees which files are shared */
-    ret = DARSHAN_MPI_CALL(PMPI_Allreduce)(mask_array, all_mask_array,
+    DARSHAN_MPI_CALL(PMPI_Allreduce)(mask_array, all_mask_array,
         DARSHAN_CORE_MAX_RECORDS, MPI_INT, MPI_LAND, MPI_COMM_WORLD);
-    if(ret != 0)
-    {
-        return(-1);
-    }
 
     ndx = 0;
     for(i=0; (i<DARSHAN_CORE_MAX_RECORDS && id_array[i] != 0); i++)
@@ -819,13 +774,74 @@ static int darshan_get_shared_record_ids(struct darshan_core_runtime *job,
         }
     }
 
+    return;
+}
+
+/* Collectively open the darshan log file for writing on MPI_COMM_WORLD,
+ * applying MPI-IO hints ("key=value" pairs separated by ';') taken from
+ * the CP_LOG_HINTS_OVERRIDE environment variable or else __CP_LOG_HINTS.
+ * Returns 0 on success, -1 if PMPI_File_open reports an error.
+ */
+static int darshan_log_coll_open(char *logfile_name, MPI_File *log_fh)
+{
+    char *hints;
+    char *hints_copy;
+    char *key;
+    char *value;
+    char *saveptr = NULL;
+    int ret;
+    MPI_Info info;
+
+    MPI_Info_create(&info);
+
+    /* the environment may override the default (compile-time) hints */
+    hints = getenv(CP_LOG_HINTS_OVERRIDE);
+    if(!hints)
+    {
+        hints = __CP_LOG_HINTS;
+    }
+
+    if(hints && strlen(hints) > 0)
+    {
+        /* strtok_r modifies its input, so tokenize a private copy */
+        hints_copy = strdup(hints);
+        if(hints_copy)
+        {
+            for(key = strtok_r(hints_copy, ";", &saveptr); key != NULL;
+                key = strtok_r(NULL, ";", &saveptr))
+            {
+                /* look for = sign splitting key/value pairs */
+                value = strchr(key, '=');
+                if(value)
+                {
+                    /* break key and value into separate null terminated strings */
+                    value[0] = '\0';
+                    value++;
+                    if(strlen(key) > 0)
+                        MPI_Info_set(info, key, value);
+                }
+            }
+            free(hints_copy);
+        }
+    }
+
+    /* open the darshan log file for writing */
+    ret = DARSHAN_MPI_CALL(PMPI_File_open)(MPI_COMM_WORLD, logfile_name,
+        MPI_MODE_CREATE | MPI_MODE_WRONLY | MPI_MODE_EXCL, info, log_fh);
+
+    /* release the info object on every path; MPI error codes are positive,
+     * so compare against MPI_SUCCESS rather than testing for ret < 0 */
+    MPI_Info_free(&info);
+    if(ret != MPI_SUCCESS)
+        return(-1);
+
     return(0);
 }
 
 /* NOTE: the map written to file may contain duplicate id->name entries if a
  *       record is opened by multiple ranks, but not all ranks
  */
-static int darshan_log_write_record_map(MPI_File log_fh, struct darshan_core_record_ref *rec_hash,
+static int darshan_log_write_record_hash(MPI_File log_fh, struct darshan_core_record_ref *rec_hash,
     darshan_record_id *shared_recs, struct darshan_log_map *map)
 {
     int i;
@@ -983,7 +999,6 @@ static int darshan_log_coll_write(MPI_File log_fh, void *buf, int count,
 
 void darshan_core_register_module(
     darshan_module_id id,
-    char *name,
     struct darshan_module_funcs *funcs,
     int *runtime_mem_limit)
 {
@@ -1017,7 +1032,6 @@ void darshan_core_register_module(
     memset(mod, 0, sizeof(*mod));
 
     mod->id = id;
-    strncpy(mod->name, name, DARSHAN_MOD_NAME_LEN);
     mod->mod_funcs = *funcs;
 
     /* register module with darshan */
diff --git a/darshan-runtime/lib/darshan-posix.c b/darshan-runtime/lib/darshan-posix.c
index 269b8ab..1578b92 100644
--- a/darshan-runtime/lib/darshan-posix.c
+++ b/darshan-runtime/lib/darshan-posix.c
@@ -45,8 +45,6 @@ typedef int64_t off64_t;
 
 #define MAP_OR_FAIL(func)
 
-#define POSIX_MOD_NAME "POSIX"
-
 struct posix_runtime_file
 {
     struct darshan_posix_file* file_record;
@@ -245,7 +243,6 @@ static void posix_runtime_initialize()
     /* register the posix module with darshan core */
     darshan_core_register_module(
         DARSHAN_POSIX_MOD,
-        POSIX_MOD_NAME,
         &posix_mod_fns,
         &mem_limit);
 


hooks/post-receive
--



More information about the Darshan-commits mailing list