[Darshan-commits] [Darshan] branch, dev-modular, updated. abf424f2859b8ea43a4818a2c086531e39134dd4
Service Account
git at mcs.anl.gov
Fri Jan 16 11:55:00 CST 2015
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "Darshan".
The branch, dev-modular, has been updated
via abf424f2859b8ea43a4818a2c086531e39134dd4 (commit)
from da477e427153126228171de95ea8d9a3a78e0a1a (commit)
Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.
- Log -----------------------------------------------------------------
commit abf424f2859b8ea43a4818a2c086531e39134dd4
Author: Shane Snyder <ssnyder at mcs.anl.gov>
Date: Fri Jan 16 11:54:03 2015 -0600
More changes to support darshan_core shutdown
-----------------------------------------------------------------------
Summary of changes:
darshan-runtime/darshan-core.h | 15 ++-
darshan-runtime/darshan.h | 12 +-
darshan-runtime/lib/darshan-core.c | 228 +++++++++++++++++++++++++++--------
darshan-runtime/lib/darshan-posix.c | 14 +-
4 files changed, 204 insertions(+), 65 deletions(-)
Diff of changes:
diff --git a/darshan-runtime/darshan-core.h b/darshan-runtime/darshan-core.h
index ec853c0..955de0b 100644
--- a/darshan-runtime/darshan-core.h
+++ b/darshan-runtime/darshan-core.h
@@ -12,6 +12,9 @@
#include "darshan.h"
+/* TODO: enforce this when handing out ids */
+#define DARSHAN_CORE_MAX_RECORDS 1024
+
struct darshan_core_module
{
darshan_module_id id;
@@ -21,12 +24,20 @@ struct darshan_core_module
/* in memory structure to keep up with job level data */
/* TODO: trailing data ? */
-struct darshan_core_job_runtime
+struct darshan_core_runtime
{
struct darshan_job log_job;
- struct darshan_core_module* mod_array[DARSHAN_MAX_MODS];
char exe[CP_EXE_LEN+1];
double wtime_offset;
+ struct darshan_core_record_ref *rec_hash;
+ struct darshan_core_module* mod_array[DARSHAN_MAX_MODS];
+};
+
+struct darshan_core_record_ref
+{
+ char* name;
+ darshan_record_id id;
+ UT_hash_handle hlink;
};
#endif /* __DARSHAN_CORE_H */
diff --git a/darshan-runtime/darshan.h b/darshan-runtime/darshan.h
index ee3ead6..8e86e35 100644
--- a/darshan-runtime/darshan.h
+++ b/darshan-runtime/darshan.h
@@ -44,7 +44,7 @@ typedef enum
DARSHAN_PNETCDF_MOD,
} darshan_module_id;
-typedef uint64_t darshan_file_id;
+typedef uint64_t darshan_record_id;
struct darshan_module_funcs
{
@@ -56,9 +56,9 @@ struct darshan_module_funcs
void (*shutdown)(void);
};
-/*********************************************
-* darshan-core functions for darshan modules *
-*********************************************/
+/*****************************************************
+* darshan-core functions exported to darshan modules *
+*****************************************************/
void darshan_core_register_module(
darshan_module_id id,
@@ -66,11 +66,11 @@ void darshan_core_register_module(
struct darshan_module_funcs *funcs,
int *runtime_mem_limit);
-void darshan_core_lookup_id(
+void darshan_core_lookup_record_id(
void *name,
int len,
int printable_flag,
- darshan_file_id *id);
+ darshan_record_id *id);
double darshan_core_wtime(void);
diff --git a/darshan-runtime/lib/darshan-core.c b/darshan-runtime/lib/darshan-core.c
index 27df73a..89c0cca 100644
--- a/darshan-runtime/lib/darshan-core.c
+++ b/darshan-runtime/lib/darshan-core.c
@@ -21,25 +21,28 @@
#include <sys/vfs.h>
#include <mpi.h>
+#include "uthash.h"
#include "darshan-core.h"
/* TODO is __progname_full needed here */
extern char* __progname;
/* internal variables */
-static struct darshan_core_job_runtime *darshan_core_job = NULL;
-static pthread_mutex_t darshan_mutex = PTHREAD_MUTEX_INITIALIZER;
+static struct darshan_core_runtime *darshan_core_job = NULL;
+static pthread_mutex_t darshan_core_mutex = PTHREAD_MUTEX_INITIALIZER;
static int my_rank = -1;
static int nprocs = -1;
static void darshan_core_initialize(int *argc, char ***argv);
static void darshan_core_shutdown(void);
-static void darshan_core_cleanup(struct darshan_core_job_runtime* job);
+static void darshan_core_cleanup(struct darshan_core_runtime* job);
static void darshan_get_logfile_name(char* logfile_name, int jobid, struct tm* start_tm);
-static void darshan_log_record_hints_and_ver(struct darshan_core_job_runtime* job);
+static void darshan_log_record_hints_and_ver(struct darshan_core_runtime* job);
+static int darshan_get_shared_record_ids(darshan_record_id *shared_recs);
+static void darshan_write_record_map(void);
-#define DARSHAN_LOCK() pthread_mutex_lock(&darshan_mutex)
-#define DARSHAN_UNLOCK() pthread_mutex_unlock(&darshan_mutex)
+#define DARSHAN_CORE_LOCK() pthread_mutex_lock(&darshan_core_mutex)
+#define DARSHAN_CORE_UNLOCK() pthread_mutex_unlock(&darshan_core_mutex)
/* intercept MPI initialize and finalize to manage darshan core runtime */
int MPI_Init(int *argc, char ***argv)
@@ -168,14 +171,15 @@ static void darshan_core_initialize(int *argc, char ***argv)
static void darshan_core_shutdown()
{
+ int i;
char *logfile_name;
- struct darshan_core_job_runtime* final_job;
+ struct darshan_core_runtime *final_job;
struct darshan_core_module *mod, *tmp;
int internal_timing_flag = 0;
- char* envjobid;
- char* jobid_str;
+ char *envjobid;
+ char *jobid_str;
int jobid;
- struct tm* start_tm;
+ struct tm *start_tm;
time_t start_time_tmp;
int ret = 0;
int all_ret = 0;
@@ -183,15 +187,15 @@ static void darshan_core_shutdown()
int64_t last_end_time;
int local_mod_use[DARSHAN_MAX_MODS] = {0};
int global_mod_use_count[DARSHAN_MAX_MODS] = {0};
- int i;
- char* key;
- char* value;
- char* hints;
- char* tok_str;
- char* orig_tok_str;
- char* saveptr = NULL;
- char* mod_index;
- char* new_logfile_name;
+ darshan_record_id shared_recs[DARSHAN_CORE_MAX_RECORDS] = {0};
+ char *key;
+ char *value;
+ char *hints;
+ char *tok_str;
+ char *orig_tok_str;
+ char *saveptr = NULL;
+ char *mod_index;
+ char *new_logfile_name;
double start_log_time;
double end_log_time;
long offset;
@@ -202,10 +206,10 @@ static void darshan_core_shutdown()
if(getenv("DARSHAN_INTERNAL_TIMING"))
internal_timing_flag = 1;
- DARSHAN_LOCK();
+ DARSHAN_CORE_LOCK();
if(!darshan_core_job)
{
- DARSHAN_UNLOCK();
+ DARSHAN_CORE_UNLOCK();
return;
}
/* disable further tracing while hanging onto the data so that we can
@@ -213,7 +217,7 @@ static void darshan_core_shutdown()
*/
final_job = darshan_core_job;
darshan_core_job = NULL;
- DARSHAN_UNLOCK();
+ DARSHAN_CORE_UNLOCK();
start_log_time = DARSHAN_MPI_CALL(PMPI_Wtime)();
@@ -296,11 +300,27 @@ static void darshan_core_shutdown()
/* reduce the number of times a module was opened globally and bcast to everyone */
DARSHAN_MPI_CALL(PMPI_Allreduce)(local_mod_use, global_mod_use_count, DARSHAN_MAX_MODS, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
- MPI_Info_create(&info);
+ /* get a list of records which are shared across all processes */
+ ret = darshan_get_shared_record_ids(shared_recs);
+ DARSHAN_MPI_CALL(PMPI_Allreduce)(&ret, &all_ret, 1, MPI_INT,
+ MPI_LOR, MPI_COMM_WORLD);
+ if(all_ret != 0)
+ {
+ if(my_rank == 0)
+ {
+ fprintf(stderr, "darshan library warning: unable to determine shared file records\n");
+ }
+ free(logfile_name);
+ darshan_core_cleanup(final_job);
+ return;
+
+ }
/* check environment variable to see if the default MPI file hints have
* been overridden
*/
+ MPI_Info_create(&info);
+
hints = getenv(CP_LOG_HINTS_OVERRIDE);
if(!hints)
{
@@ -361,34 +381,34 @@ static void darshan_core_shutdown()
return;
}
- /* TODO: is there another header, or is job info first data ? */
- /* TODO: are MPI data types necessary or can we just write buffers of MPI_BYTEs? */
-
- /* write the job info on rank 0 */
- if(my_rank == 0)
+ /* reserve space at beginning of darshan log for uncompressed header using seek */
+ /* NOTE: the header includes the darshan job struct and the module indices map */
+ MPI_Offset header_end = sizeof(struct darshan_job);
+ /* header_end += (); TODO: how much do i leave for the indices map? */
+ ret = DARSHAN_MPI_CALL(PMPI_File_seek)(log_fh, header_end, MPI_SEEK_SET);
+ if(ret != MPI_SUCCESS)
{
- ret = DARSHAN_MPI_CALL(PMPI_File_write_at)(log_fh, 0, &(final_job->log_job),
- sizeof(struct darshan_job), MPI_BYTE, &status);
- if(ret != MPI_SUCCESS)
+ if(my_rank == 0)
{
int msg_len;
char msg[MPI_MAX_ERROR_STRING] = {0};
MPI_Error_string(ret, msg, &msg_len);
- fprintf(stderr, "darshan library warning: unable to write job data to log file %s: %s\n",
+ fprintf(stderr, "darshan library warning: unable to seek in log file %s: %s\n",
logfile_name, msg);
unlink(logfile_name);
- free(logfile_name);
- darshan_core_cleanup(final_job);
- return;
}
+ free(logfile_name);
+ darshan_core_cleanup(final_job);
+ return;
}
- /* TODO: id->file name map write */
+ /* TODO implement */
+ darshan_write_record_map();
/* loop over globally used darshan modules and:
* - get final output buffer
- * - compress (zlib/bzip2) provided output buffer
+ * - compress (zlib) provided output buffer
* - write compressed buffer to log file
* - shutdown the module
*/
@@ -492,6 +512,26 @@ static void darshan_core_shutdown()
MPI_Comm_free(&mod_comm);
}
+ /* TODO: is this still right? -- write the job info on rank 0 */
+ if(my_rank == 0)
+ {
+ ret = DARSHAN_MPI_CALL(PMPI_File_write_at)(log_fh, 0, &(final_job->log_job),
+ sizeof(struct darshan_job), MPI_BYTE, &status);
+ if(ret != MPI_SUCCESS)
+ {
+ int msg_len;
+ char msg[MPI_MAX_ERROR_STRING] = {0};
+
+ MPI_Error_string(ret, msg, &msg_len);
+ fprintf(stderr, "darshan library warning: unable to write job data to log file %s: %s\n",
+ logfile_name, msg);
+ unlink(logfile_name);
+ free(logfile_name);
+ darshan_core_cleanup(final_job);
+ return;
+ }
+ }
+
DARSHAN_MPI_CALL(PMPI_File_close)(&log_fh);
/* if we got this far, there are no errors, so rename from *.darshan_partial
@@ -516,8 +556,8 @@ static void darshan_core_shutdown()
free(new_logfile_name);
}
- free(logfile_name);
darshan_core_cleanup(final_job);
+ free(logfile_name);
if(internal_timing_flag)
{
@@ -528,7 +568,7 @@ static void darshan_core_shutdown()
}
/* free darshan core data structures to shutdown */
-static void darshan_core_cleanup(struct darshan_core_job_runtime* job)
+static void darshan_core_cleanup(struct darshan_core_runtime* job)
{
int i;
@@ -683,7 +723,7 @@ static void darshan_get_logfile_name(char* logfile_name, int jobid, struct tm* s
}
/* record any hints used to write the darshan log in the log header */
-static void darshan_log_record_hints_and_ver(struct darshan_core_job_runtime* job)
+static void darshan_log_record_hints_and_ver(struct darshan_core_runtime* job)
{
char* hints;
char* header_hints;
@@ -728,6 +768,74 @@ static void darshan_log_record_hints_and_ver(struct darshan_core_job_runtime* jo
return;
}
+static int darshan_get_shared_record_ids(darshan_record_id *shared_recs)
+{
+ int i;
+ int ndx;
+ int ret;
+ struct darshan_core_record_ref *ref, *tmp;
+ darshan_record_id id_array[DARSHAN_CORE_MAX_RECORDS] = {0};
+ darshan_record_id mask_array[DARSHAN_CORE_MAX_RECORDS] = {0};
+ darshan_record_id all_mask_array[DARSHAN_CORE_MAX_RECORDS] = {0};
+
+ /* first, determine list of records root process has opened */
+ if(my_rank == 0)
+ {
+ ndx = 0;
+ HASH_ITER(hlink, darshan_core_job->rec_hash, ref, tmp)
+ {
+ id_array[ndx++] = ref->id;
+ }
+ }
+
+ /* broadcast root's list of records to all other processes */
+ ret = DARSHAN_MPI_CALL(PMPI_Bcast)(id_array,
+ (DARSHAN_CORE_MAX_RECORDS * sizeof(darshan_record_id)),
+ MPI_BYTE, 0, MPI_COMM_WORLD);
+ if(ret != 0)
+ {
+ return -1;
+ }
+
+ /* everyone looks to see if they opened the same records as root */
+ for(i=0; (i<DARSHAN_CORE_MAX_RECORDS && id_array[i] != 0); i++)
+ {
+ HASH_ITER(hlink, darshan_core_job->rec_hash, ref, tmp)
+ {
+ if(id_array[i] == ref->id)
+ {
+ /* we opened that record too */
+ mask_array[i] = 1;
+ break;
+ }
+ }
+ }
+
+ /* now allreduce so everyone agrees which files are shared */
+ ret = DARSHAN_MPI_CALL(PMPI_Allreduce)(mask_array, all_mask_array,
+ DARSHAN_CORE_MAX_RECORDS, MPI_INT, MPI_LAND, MPI_COMM_WORLD);
+ if(ret != 0)
+ {
+ return -1;
+ }
+
+ ndx = 0;
+ for(i=0; (i<DARSHAN_CORE_MAX_RECORDS && id_array[i] != 0); i++)
+ {
+ if(all_mask_array[i] != 0)
+ {
+ shared_recs[ndx++] = id_array[i];
+ }
+ }
+
+ return 0;
+}
+
+static void darshan_write_record_map()
+{
+ return;
+}
+
/* ********************************************************* */
void darshan_core_register_module(
@@ -738,12 +846,12 @@ void darshan_core_register_module(
{
struct darshan_core_module* mod;
- DARSHAN_LOCK();
+ DARSHAN_CORE_LOCK();
*runtime_mem_limit = 0;
if(!darshan_core_job || (id >= DARSHAN_MAX_MODS))
{
- DARSHAN_UNLOCK();
+ DARSHAN_CORE_UNLOCK();
return;
}
@@ -752,7 +860,7 @@ void darshan_core_register_module(
{
/* if module is already registered just return */
/* NOTE: we do not recalculate memory limit here, just set to 0 */
- DARSHAN_UNLOCK();
+ DARSHAN_CORE_UNLOCK();
return;
}
@@ -760,7 +868,7 @@ void darshan_core_register_module(
mod = malloc(sizeof(*mod));
if(!mod)
{
- DARSHAN_UNLOCK();
+ DARSHAN_CORE_UNLOCK();
return;
}
memset(mod, 0, sizeof(*mod));
@@ -775,18 +883,19 @@ void darshan_core_register_module(
/* TODO: something smarter than just 2 MiB per module */
*runtime_mem_limit = 2 * 1024 * 1024;
- DARSHAN_UNLOCK();
+ DARSHAN_CORE_UNLOCK();
return;
}
-void darshan_core_lookup_id(
+void darshan_core_lookup_record_id(
void *name,
int len,
int printable_flag,
- darshan_file_id *id)
+ darshan_record_id *id)
{
- darshan_file_id tmp_id;
+ darshan_record_id tmp_id;
+ struct darshan_core_record_ref* ref;
if(!darshan_core_job)
return;
@@ -795,8 +904,27 @@ void darshan_core_lookup_id(
/* hash the input name to get a unique id for this record */
tmp_id = darshan_hash(name, len, 0);
-
- /* TODO: how to store the filename to hash mapping? */
+
+ DARSHAN_CORE_LOCK();
+
+ /* check to see if we've already stored the id->name mapping for this record */
+ HASH_FIND(hlink, darshan_core_job->rec_hash, &tmp_id, sizeof(darshan_record_id), ref);
+ if(!ref)
+ {
+ /* if not, add this record to the hash */
+ ref = malloc(sizeof(struct darshan_core_record_ref));
+ if(ref)
+ {
+ ref->id = tmp_id;
+ ref->name = malloc(strlen(name) + 1);
+ if(ref->name)
+ strcpy(ref->name, name);
+
+ HASH_ADD(hlink, darshan_core_job->rec_hash, id, sizeof(darshan_record_id), ref);
+ }
+ }
+
+ DARSHAN_CORE_UNLOCK();
*id = tmp_id;
return;
diff --git a/darshan-runtime/lib/darshan-posix.c b/darshan-runtime/lib/darshan-posix.c
index cbf3fd3..7ba3ad3 100644
--- a/darshan-runtime/lib/darshan-posix.c
+++ b/darshan-runtime/lib/darshan-posix.c
@@ -25,8 +25,8 @@
#include <aio.h>
#include <pthread.h>
-#include "darshan.h"
#include "uthash.h"
+#include "darshan.h"
#ifndef HAVE_OFF64_T
typedef int64_t off64_t;
@@ -159,7 +159,7 @@ enum darshan_f_posix_indices
struct darshan_posix_file
{
- darshan_file_id f_id;
+ darshan_record_id f_id;
int64_t rank;
int64_t counters[CP_NUM_INDICES];
double fcounters[CP_F_NUM_INDICES];
@@ -433,7 +433,7 @@ static struct posix_runtime_file* posix_file_by_name(const char *name)
{
struct posix_runtime_file *file = NULL;
char *newname = NULL;
- darshan_file_id file_id;
+ darshan_record_id file_id;
if(!posix_runtime)
return(NULL);
@@ -443,14 +443,14 @@ static struct posix_runtime_file* posix_file_by_name(const char *name)
newname = (char*)name;
/* get a unique id for this file from darshan core */
- darshan_core_lookup_id(
+ darshan_core_lookup_record_id(
(void*)newname,
strlen(newname),
1,
&file_id);
/* search the hash table for this file record, and return if found */
- HASH_FIND(hlink, posix_runtime->file_hash, &file_id, sizeof(darshan_file_id), file);
+ HASH_FIND(hlink, posix_runtime->file_hash, &file_id, sizeof(darshan_record_id), file);
if(file)
{
if(newname != name)
@@ -466,7 +466,7 @@ static struct posix_runtime_file* posix_file_by_name(const char *name)
file->file_record->f_id = file_id;
/* add new record to file hash table */
- HASH_ADD(hlink, posix_runtime->file_hash, file_record->f_id, sizeof(darshan_file_id), file);
+ HASH_ADD(hlink, posix_runtime->file_hash, file_record->f_id, sizeof(darshan_record_id), file);
posix_runtime->file_array_ndx++;
}
@@ -518,7 +518,7 @@ static struct posix_runtime_file* posix_file_by_name_setfd(const char* name, int
static void posix_file_close_fd(int fd)
{
- struct posix_runtime_file_ref *ref;
+ struct posix_runtime_file_ref* ref;
if(!posix_runtime)
return;
hooks/post-receive
--
More information about the Darshan-commits
mailing list