[Darshan-commits] [Git][darshan/darshan][master] 2 commits: improve handling of incomplete logs

Shane Snyder xgitlab at cels.anl.gov
Tue Mar 16 14:05:53 CDT 2021



Shane Snyder pushed to branch master at darshan / darshan


Commits:
1bd5b2db by Philip Carns at 2021-03-16T14:05:47-05:00
improve handling of incomplete logs

- exit with error by default in parsers
- show more verbose error message with tips to help
- add --show-incomplete option to enable previous behavior
- properly set incomplete flag for modules that don't store any records

- - - - -
7d4a80ab by Shane Snyder at 2021-03-16T14:05:47-05:00
Merge branch 'carns/issue-284-incomplete-warning' into 'master'

improve handling of incomplete logs

Closes #284

See merge request darshan/darshan!87
- - - - -


14 changed files:

- ChangeLog
- darshan-runtime/lib/darshan-bgq.c
- darshan-runtime/lib/darshan-hdf5.c
- darshan-runtime/lib/darshan-lustre.c
- darshan-runtime/lib/darshan-mdhim.c
- darshan-runtime/lib/darshan-mpiio.c
- darshan-runtime/lib/darshan-null.c
- darshan-runtime/lib/darshan-pnetcdf.c
- darshan-runtime/lib/darshan-posix.c
- darshan-runtime/lib/darshan-stdio.c
- darshan-util/darshan-dxt-parser.c
- darshan-util/darshan-job-summary/bin/darshan-job-summary.pl.in
- darshan-util/darshan-job-summary/share/summary.tex
- darshan-util/darshan-parser.c


Changes:

=====================================
ChangeLog
=====================================
@@ -2,10 +2,12 @@
 Darshan Release Change Log
 --------------------------
 
-Darshan-3.2.2
+Darshan-3.3
 =============
 * add wrappers for preadv, preadv2, pwritev, and pwritev2 (improves profiling
   of ompio)
+* improve error handling in command line parsers to make incomplete logs more
+  obvious to users
 
 Darshan-3.2.1
 =============


=====================================
darshan-runtime/lib/darshan-bgq.c
=====================================
@@ -136,14 +136,6 @@ void bgq_runtime_initialize()
         &my_rank,
         NULL);
 
-    /* not enough memory to fit bgq module record */
-    if(bgq_buf_size < sizeof(struct darshan_bgq_record))
-    {
-        darshan_core_unregister_module(DARSHAN_BGQ_MOD);
-        BGQ_UNLOCK();
-        return;
-    }
-
     /* initialize module's global state */
     bgq_runtime = malloc(sizeof(*bgq_runtime));
     if(!bgq_runtime)


=====================================
darshan-runtime/lib/darshan-hdf5.c
=====================================
@@ -933,13 +933,6 @@ static void hdf5_file_runtime_initialize()
         &my_rank,
         NULL);
 
-    /* return if darshan-core does not provide enough module memory */
-    if(hdf5_buf_size < sizeof(struct darshan_hdf5_file))
-    {
-        darshan_core_unregister_module(DARSHAN_H5F_MOD);
-        return;
-    }
-
     hdf5_file_runtime = malloc(sizeof(*hdf5_file_runtime));
     if(!hdf5_file_runtime)
     {
@@ -972,13 +965,6 @@ static void hdf5_dataset_runtime_initialize()
         &my_rank,
         NULL);
 
-    /* return if darshan-core does not provide enough module memory */
-    if(hdf5_buf_size < sizeof(struct darshan_hdf5_dataset))
-    {
-        darshan_core_unregister_module(DARSHAN_H5D_MOD);
-        return;
-    }
-
     hdf5_dataset_runtime = malloc(sizeof(*hdf5_dataset_runtime));
     if(!hdf5_dataset_runtime)
     {


=====================================
darshan-runtime/lib/darshan-lustre.c
=====================================
@@ -203,15 +203,6 @@ static void lustre_runtime_initialize()
         &my_rank,
         NULL);
 
-    if(lustre_buf_size < LUSTRE_RECORD_SIZE(1))
-    {
-        /* unregister module if we aren't allocated enough space for
-         * the smallest possible record
-         */
-        darshan_core_unregister_module(DARSHAN_LUSTRE_MOD);
-        return;
-    }
-
     lustre_runtime = malloc(sizeof(*lustre_runtime));
     if(!lustre_runtime)
     {


=====================================
darshan-runtime/lib/darshan-mdhim.c
=====================================
@@ -325,14 +325,6 @@ static void mdhim_runtime_initialize()
         &my_rank,
         NULL);
 
-    /* return if darshan-core does not provide enough module memory for at 
-     * least one MDHIM record
-     */
-    if(mdhim_buf_size < sizeof(struct darshan_mdhim_record))
-    {
-        darshan_core_unregister_module(DARSHAN_MDHIM_MOD);
-        return;
-    }
 
     /* initialize module's global state */
     mdhim_runtime = calloc(1, sizeof(*mdhim_runtime));


=====================================
darshan-runtime/lib/darshan-mpiio.c
=====================================
@@ -1174,13 +1174,6 @@ static void mpiio_runtime_initialize()
         &my_rank,
         NULL);
 
-    /* return if darshan-core does not provide enough module memory */
-    if(mpiio_buf_size < sizeof(struct darshan_mpiio_file))
-    {
-        darshan_core_unregister_module(DARSHAN_MPIIO_MOD);
-        return;
-    }
-
     mpiio_runtime = malloc(sizeof(*mpiio_runtime));
     if(!mpiio_runtime)
     {


=====================================
darshan-runtime/lib/darshan-null.c
=====================================
@@ -224,15 +224,6 @@ static void null_runtime_initialize()
         &my_rank,
         NULL);
 
-    /* return if darshan-core does not provide enough module memory for at 
-     * least one NULL record
-     */
-    if(null_buf_size < sizeof(struct darshan_null_record))
-    {
-        darshan_core_unregister_module(DARSHAN_NULL_MOD);
-        return;
-    }
-
     /* initialize module's global state */
     null_runtime = malloc(sizeof(*null_runtime));
     if(!null_runtime)


=====================================
darshan-runtime/lib/darshan-pnetcdf.c
=====================================
@@ -231,13 +231,6 @@ static void pnetcdf_runtime_initialize()
         &my_rank,
         NULL);
 
-    /* return if darshan-core does not provide enough module memory */
-    if(pnetcdf_buf_size < sizeof(struct darshan_pnetcdf_file))
-    {
-        darshan_core_unregister_module(DARSHAN_PNETCDF_MOD);
-        return;
-    }
-
     pnetcdf_runtime = malloc(sizeof(*pnetcdf_runtime));
     if(!pnetcdf_runtime)
     {


=====================================
darshan-runtime/lib/darshan-posix.c
=====================================
@@ -1883,13 +1883,6 @@ static void posix_runtime_initialize()
         &my_rank,
         &darshan_mem_alignment);
 
-    /* return if darshan-core does not provide enough module memory */
-    if(psx_buf_size < sizeof(struct darshan_posix_file))
-    {
-        darshan_core_unregister_module(DARSHAN_POSIX_MOD);
-        return;
-    }
-
     posix_runtime = malloc(sizeof(*posix_runtime));
     if(!posix_runtime)
     {


=====================================
darshan-runtime/lib/darshan-stdio.c
=====================================
@@ -1018,13 +1018,6 @@ static void stdio_runtime_initialize()
         &my_rank,
         &darshan_mem_alignment);
 
-    /* return if darshan-core does not provide enough module memory */
-    if(stdio_buf_size < sizeof(struct darshan_stdio_file))
-    {
-        darshan_core_unregister_module(DARSHAN_STDIO_MOD);
-        return;
-    }
-
     stdio_runtime = malloc(sizeof(*stdio_runtime));
     if(!stdio_runtime)
     {


=====================================
darshan-util/darshan-dxt-parser.c
=====================================
@@ -20,15 +20,14 @@
 
 #include "darshan-logutils.h"
 
-int usage (char *exename)
-{
-    fprintf(stderr, "Usage: %s <filename>\n", exename);
+#define OPTION_SHOW_INCOMPLETE  (1 << 7)  /* show what we have, even if log is incomplete */
 
-    exit(1);
-}
+static int usage (char *exename);
+static int parse_args (int argc, char **argv, char **filename);
 
 int main(int argc, char **argv)
 {
+    int mask;
     int ret;
     int i, j;
     char *filename;
@@ -49,10 +48,7 @@ int main(int argc, char **argv)
     struct lustre_record_ref *lustre_rec_hash = NULL;
     char *mod_buf = NULL;
 
-    if (argc != 2)
-        usage(argv[0]);
-
-    filename = argv[1];
+    mask = parse_args(argc, argv, &filename);
 
     fd = darshan_log_open(filename);
     if (!fd)
@@ -195,11 +191,44 @@ int main(int argc, char **argv)
             continue;
 
         /* print warning if this module only stored partial data */
-        if(DARSHAN_MOD_FLAG_ISSET(fd->partial_flag, i))
-            printf("\n# *WARNING*: The %s module contains incomplete data!\n"
-                   "#            This happens when a module runs out of\n"
-                   "#            memory to store new record data.\n",
-                   darshan_module_names[i]);
+        if(DARSHAN_MOD_FLAG_ISSET(fd->partial_flag, i)) {
+            if(mask & OPTION_SHOW_INCOMPLETE)
+            {
+                /* user requested that we show the data we have anyway */
+                printf("\n# *WARNING*: "
+                       "The %s module contains incomplete data!\n"
+                       "#            This happens when a module runs out of\n"
+                       "#            memory to store new record data.\n",
+                       darshan_module_names[i]);
+                printf(
+                       "\n# To avoid this error, consult the darshan-runtime\n"
+                       "# documentation and consider setting the\n"
+                       "# DARSHAN_EXCLUDE_DIRS or DXT_TRIGGER_CONF_PATH\n"
+                       "# environment variable to prevent Darshan from\n"
+                       "# instrumenting unecessary files.\n");
+            }
+            else
+            {
+                /* hard error */
+                fprintf(stderr, "\n# *ERROR*: "
+                       "The %s module contains incomplete data!\n"
+                       "#            This happens when a module runs out of\n"
+                       "#            memory to store new record data.\n",
+                       darshan_module_names[i]);
+                fprintf(stderr,
+                       "\n# To avoid this error, consult the darshan-runtime\n"
+                       "# documentation and consider setting the\n"
+                       "# DARSHAN_EXCLUDE_DIRS or DXT_TRIGGER_CONF_PATH\n"
+                       "# environment variable to prevent Darshan from\n"
+                       "# instrumenting unecessary files.\n");
+                fprintf(stderr,
+                        "\n# You can display the (incomplete) data that is\n"
+                        "# present in this log using the --show-incomplete\n"
+                        "# option to darshan-dxt-parser.\n");
+                return(-1);
+            }
+
+        }
 
         /* loop over each of this module's records and print them */
         while(1)
@@ -311,6 +340,59 @@ cleanup:
     return(ret);
 }
 
+static int parse_args (int argc, char **argv, char **filename)
+{
+    int index;
+    int mask;
+    static struct option long_opts[] =
+    {
+        {"show-incomplete", 0, NULL, OPTION_SHOW_INCOMPLETE},
+        {"help",  0, NULL, 0},
+        {0, 0, 0, 0}
+    };
+
+    mask = 0;
+
+    while(1)
+    {
+        int c = getopt_long(argc, argv, "", long_opts, &index);
+
+        if (c == -1) break;
+
+        switch(c)
+        {
+            case OPTION_SHOW_INCOMPLETE:
+                mask |= c;
+                break;
+            case 0:
+            case '?':
+            default:
+                usage(argv[0]);
+                break;
+        }
+    }
+
+    if (optind < argc)
+    {
+        *filename = argv[optind];
+    }
+    else
+    {
+        usage(argv[0]);
+    }
+
+    return mask;
+}
+
+static int usage (char *exename)
+{
+    fprintf(stderr, "Usage: %s [options] <filename>\n", exename);
+    fprintf(stderr, "    --show-incomplete : display results even if log is incomplete\n");
+
+    exit(1);
+}
+
+
 /*
  * Local variables:
  *  c-indent-level: 4


=====================================
darshan-util/darshan-job-summary/bin/darshan-job-summary.pl.in
=====================================
@@ -54,7 +54,10 @@ if ($verbose_flag)
     print "verbose: $tmp_dir\n";
 }
 
-open(PARSE_OUT, "$darshan_parser --base --perf $input_file |") || die("Can't execute \"$darshan_parser $input_file\": $!\n");
+# Note that we use the --show-incomplete option here because
+# darshan-job-summary will display a clear warning if this condition is
+# encountered anyway.
+open(PARSE_OUT, "$darshan_parser --show-incomplete --base --perf $input_file |") || die("Can't execute \"$darshan_parser $input_file\": $!\n");
 
 open(FA_READ, ">$tmp_dir/file-access-read.dat") || die("error opening output file: $!\n");
 open(FA_WRITE, ">$tmp_dir/file-access-write.dat") || die("error opening output file: $!\n");


=====================================
darshan-util/darshan-job-summary/share/summary.tex
=====================================
@@ -37,8 +37,9 @@
 \twocolumn[
 \vspace{3.5in}
 \center
-{\bf \textcolor{red}{WARNING}}: This Darshan log contains incomplete data
-which may skew results in this document.
+{\bf \textcolor{red}{WARNING}}: This Darshan log contains incomplete data.
+This happens when a module runs out of memory to store new record data.
+Please run darshan-parser on the log file for more information.
 \endcenter
 ]
 \newpage


=====================================
darshan-util/darshan-parser.c
=====================================
@@ -29,13 +29,15 @@
 #define OPTION_FILE  (1 << 3)  /* file count totals */
 #define OPTION_FILE_LIST  (1 << 4)  /* per-file summaries */
 #define OPTION_FILE_LIST_DETAILED  (1 << 6)  /* per-file summaries with extra detail */
+#define OPTION_SHOW_INCOMPLETE  (1 << 7)  /* show what we have, even if log is incomplete */
 #define OPTION_ALL (\
   OPTION_BASE|\
   OPTION_TOTAL|\
   OPTION_PERF|\
   OPTION_FILE|\
   OPTION_FILE_LIST|\
-  OPTION_FILE_LIST_DETAILED)
+  OPTION_FILE_LIST_DETAILED|\
+  OPTION_SHOW_INCOMPLETE)
 
 #define FILETYPE_SHARED (1 << 0)
 #define FILETYPE_UNIQUE (1 << 1)
@@ -131,6 +133,7 @@ int usage (char *exename)
     fprintf(stderr, "    --file-list-detailed  : per-file summaries with additional detail\n");
     fprintf(stderr, "    --perf  : derived perf data\n");
     fprintf(stderr, "    --total : aggregated darshan field data\n");
+    fprintf(stderr, "    --show-incomplete : display results even if log is incomplete\n");
 
     exit(1);
 }
@@ -148,6 +151,7 @@ int parse_args (int argc, char **argv, char **filename)
         {"file-list-detailed",  0, NULL, OPTION_FILE_LIST_DETAILED},
         {"perf",  0, NULL, OPTION_PERF},
         {"total", 0, NULL, OPTION_TOTAL},
+        {"show-incomplete", 0, NULL, OPTION_SHOW_INCOMPLETE},
         {"help",  0, NULL, 0},
         {0, 0, 0, 0}
     };
@@ -169,6 +173,7 @@ int parse_args (int argc, char **argv, char **filename)
             case OPTION_FILE_LIST_DETAILED:
             case OPTION_PERF:
             case OPTION_TOTAL:
+            case OPTION_SHOW_INCOMPLETE:
                 mask |= c;
                 break;
             case 0:
@@ -189,9 +194,9 @@ int parse_args (int argc, char **argv, char **filename)
     }
 
     /* default mask value if none specified */
-    if (mask == 0)
+    if (mask == 0 || mask == OPTION_SHOW_INCOMPLETE)
     {
-        mask = OPTION_BASE;
+        mask |= OPTION_BASE;
     }
 
     return mask;
@@ -326,7 +331,7 @@ int main(int argc, char **argv)
     printf("# record table: %zu bytes (compressed)\n", fd->name_map.len);
     for(i=0; i<DARSHAN_MAX_MODS; i++)
     {
-        if(fd->mod_map[i].len)
+        if(fd->mod_map[i].len || DARSHAN_MOD_FLAG_ISSET(fd->partial_flag, i))
         {
             printf("# %s module: %zu bytes (compressed), ver=%d\n",
                 darshan_module_names[i], fd->mod_map[i].len, fd->mod_ver[i]);
@@ -357,7 +362,6 @@ int main(int argc, char **argv)
         printf("#   <fs type>: type of file system that the file resides on.\n");
     }
 
-    /* warn user if this log file is incomplete */
     pdata.rank_cumul_io_time = malloc(sizeof(double)*job.nprocs);
     pdata.rank_cumul_md_time = malloc(sizeof(double)*job.nprocs);
     if (!pdata.rank_cumul_io_time || !pdata.rank_cumul_md_time)
@@ -386,7 +390,8 @@ int main(int argc, char **argv)
         if(fd->mod_map[i].len == 0)
         {
             empty_mods++;
-            continue;
+            if(!DARSHAN_MOD_FLAG_ISSET(fd->partial_flag, i))
+                continue;
         }
         /* skip modules with no logutil definitions */
         else if(!mod_logutils[i])
@@ -413,11 +418,43 @@ int main(int argc, char **argv)
         printf("# *******************************************************\n");
 
         /* print warning if this module only stored partial data */
-        if(DARSHAN_MOD_FLAG_ISSET(fd->partial_flag, i))
-            printf("\n# *WARNING*: The %s module contains incomplete data!\n"
-                   "#            This happens when a module runs out of\n"
-                   "#            memory to store new record data.\n",
-                   darshan_module_names[i]);
+        if(DARSHAN_MOD_FLAG_ISSET(fd->partial_flag, i)) {
+            if(mask & OPTION_SHOW_INCOMPLETE)
+            {
+                /* user requested that we show the data we have anyway */
+                printf("\n# *WARNING*: "
+                       "The %s module contains incomplete data!\n"
+                       "#            This happens when a module runs out of\n"
+                       "#            memory to store new record data.\n",
+                       darshan_module_names[i]);
+                printf(
+                       "\n# To avoid this error, consult the darshan-runtime\n"
+                       "# documentation and consider setting the\n"
+                       "# DARSHAN_EXCLUDE_DIRS environment variable to prevent\n"
+                       "# Darshan from instrumenting unecessary files.\n");
+                if(fd->mod_map[i].len == 0)
+                    continue; // no data to parse
+            }
+            else
+            {
+                /* hard error */
+                fprintf(stderr, "\n# *ERROR*: "
+                       "The %s module contains incomplete data!\n"
+                       "#            This happens when a module runs out of\n"
+                       "#            memory to store new record data.\n",
+                       darshan_module_names[i]);
+                fprintf(stderr,
+                       "\n# To avoid this error, consult the darshan-runtime\n"
+                       "# documentation and consider setting the\n"
+                       "# DARSHAN_EXCLUDE_DIRS environment variable to prevent\n"
+                       "# Darshan from instrumenting unecessary files.\n");
+                fprintf(stderr,
+                        "\n# You can display the (incomplete) data that is\n"
+                        "# present in this log using the --show-incomplete\n"
+                        "# option to darshan-parser.\n");
+                return(-1);
+            }
+        }
 
         if(mask & OPTION_BASE)
         {



View it on GitLab: https://xgitlab.cels.anl.gov/darshan/darshan/-/compare/80c88c161ecbf3447f06f28007323d070b88f64d...7d4a80abaa2f2bfcc767f4f988fbf9dafff95926

-- 
View it on GitLab: https://xgitlab.cels.anl.gov/darshan/darshan/-/compare/80c88c161ecbf3447f06f28007323d070b88f64d...7d4a80abaa2f2bfcc767f4f988fbf9dafff95926
You're receiving this email because of your account on xgitlab.cels.anl.gov.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.mcs.anl.gov/pipermail/darshan-commits/attachments/20210316/add9c5b4/attachment-0001.html>


More information about the Darshan-commits mailing list