[Swift-commit] r2879 - SwiftApps/pc3
noreply at svn.ci.uchicago.edu
noreply at svn.ci.uchicago.edu
Thu Apr 23 11:37:47 CDT 2009
Author: benc
Date: 2009-04-23 11:37:47 -0500 (Thu, 23 Apr 2009)
New Revision: 2879
Modified:
SwiftApps/pc3/pc3.swift
Log:
notes in pc3.swift
Modified: SwiftApps/pc3/pc3.swift
===================================================================
--- SwiftApps/pc3/pc3.swift 2009-04-23 16:14:52 UTC (rev 2878)
+++ SwiftApps/pc3/pc3.swift 2009-04-23 16:37:47 UTC (rev 2879)
@@ -126,24 +126,44 @@
int entries;
xmlfile split_list_output[];
+// check that the batch directory and manifest are ready
is_csv_ready_file_exists_output = is_csv_ready_file_exists(csv_root_path_input);
- read_csv_ready_file_output = read_csv_ready_file(csv_root_path_input);
- is_match_csv_file_tables_output = is_match_csv_file_tables(read_csv_ready_file_output);
- if(is_match_csv_file_tables_output) {
+// read_csv_ready_file_output will contain a list of csv file entries,
+// with each containing the path of a csv, a path to a header file,
+// the rowcount for that file, the target table name, and a checksum
+read_csv_ready_file_output = read_csv_ready_file(csv_root_path_input);
+
+// check that for each csv file entry, we have a corresponding csv file
+is_match_csv_file_tables_output = is_match_csv_file_tables(read_csv_ready_file_output);
+if(is_match_csv_file_tables_output) {
+
+ // These variables are used to sequence database accesses in a dataflow
+ // style, rather than using imperative sequencing operations. With a
+ // database that allowed parallel accesses, we would not need to have
+ // such tight sequencing, though we would still need some.
external db_over_time[];
- external dbinit; // some bug in analysis means can't use db_over_time for initial one
+ external dbinit; // some bug in Swift static analysis means can't use db_over_time for initial one
+
+
+ // create_empty_load_db_output contains a reference to the
+ // newly created database
(create_empty_load_db_output, dbinit) = create_empty_load_db(job_id);
+
+ // count the entries in read_csv_ready_file_output
+ // TODO could this table be read into a Swift struct array?
count_entries_output = count_entries(read_csv_ready_file_output);
entries = readData(count_entries_output);
int entries_seq[] = [1:entries];
-
+
+ // split_list_output[] will contain one element for each entry in
+ // read_csv_ready_file_output
foreach i in entries_seq {
split_list_output[i] = extract_entry(read_csv_ready_file_output, i);
}
-
-// TODO this can merge with merge with above foreach
+// TODO this can be merged with the above foreach, and split_list_output
+// does not need to be an array then
foreach i in entries_seq {
boolean is_exists_csv_file_output;
@@ -153,28 +173,57 @@
boolean update_computed_columns_output;
boolean is_match_table_row_count_output;
boolean is_match_table_column_ranges_output;
-
+
+ // check that the data files exist, and use an external to
+ // sequence this before read_csv_file_column_names
is_exists_csv_file_output = is_exists_csv_file(split_list_output[i]);
external thread6 = checkvalid(is_exists_csv_file_output);
-read_csv_file_column_names_output = read_csv_file_column_names(split_list_output[i], thread6);
+
+ // update the CSVFileEntry with column names read from the header file
+ // (which is itself listed in the CSVFileEntry)
+ read_csv_file_column_names_output = read_csv_file_column_names(split_list_output[i], thread6);
+
+ // check that the table column names match the csv column names
+ // TODO where is the database reference passed in? is there enough
+ // information in the CSVFileEntry to determine that? there is a
+ // table name at least... I guess I should look inside
is_match_csv_file_column_names_output = is_match_csv_file_column_names(read_csv_file_column_names_output);
external thread2 = checkvalid(is_match_csv_file_column_names_output);
- if(i==1) { // first element...
+ // explicit external-based sequencing between checking table validity and
+ // loading into the DB
+
+ if(i==1) { // first element... see above note about not being
+ // able to use db_over_time[1] for this.
load_csv_file_into_table_output = load_csv_file_into_table(create_empty_load_db_output, read_csv_file_column_names_output, dbinit);
} else {
load_csv_file_into_table_output = load_csv_file_into_table(create_empty_load_db_output, read_csv_file_column_names_output, db_over_time[i]);
}
external thread3=checkvalid(load_csv_file_into_table_output);
+
+ // explicitly sequence the database load and the computed-column-update
+
+ // update the computed columns, and fail the workflow if this fails
update_computed_columns_output = update_computed_columns(create_empty_load_db_output, read_csv_file_column_names_output, thread3);
external thread4 = checkvalid(update_computed_columns_output);
+
+
+ // now check that we loaded in the right number of rows, and ranges
+ // TODO With a parallelised database, we could do these two checks in
+ // parallel, I think. But for now, we use 'thread1' to sequence them
+ // explicitly
+
is_match_table_row_count_output = is_match_table_row_count(create_empty_load_db_output, read_csv_file_column_names_output, thread4);
external thread1 = checkvalid(is_match_table_row_count_output);
is_match_table_column_ranges_output = is_match_table_column_ranges(create_empty_load_db_output, read_csv_file_column_names_output, thread1);
db_over_time[i+1] = checkvalid(is_match_table_column_ranges_output);
+
}
+
+ // TODO if we did this in a more parallel fashion, could wait for the
+ // whole db_over_time array to be closed...
compact_database(create_empty_load_db_output, db_over_time[entries+1]);
- }
- else {
+}
+else {
stop();
- }
+}
More information about the Swift-commit
mailing list