[Swift-commit] r2879 - SwiftApps/pc3
noreply at svn.ci.uchicago.edu
noreply at svn.ci.uchicago.edu
Thu Apr 23 11:37:47 CDT 2009
Author: benc
Date: 2009-04-23 11:37:47 -0500 (Thu, 23 Apr 2009)
New Revision: 2879
Modified:
SwiftApps/pc3/pc3.swift
Log:
notes in pc3.swift
Modified: SwiftApps/pc3/pc3.swift
===================================================================
--- SwiftApps/pc3/pc3.swift 2009-04-23 16:14:52 UTC (rev 2878)
+++ SwiftApps/pc3/pc3.swift 2009-04-23 16:37:47 UTC (rev 2879)
@@ -126,24 +126,44 @@
int entries;
xmlfile split_list_output[];
+// check that the batch directory and manifest are ready
is_csv_ready_file_exists_output = is_csv_ready_file_exists(csv_root_path_input);
- read_csv_ready_file_output = read_csv_ready_file(csv_root_path_input);
- is_match_csv_file_tables_output = is_match_csv_file_tables(read_csv_ready_file_output);
- if(is_match_csv_file_tables_output) {
+// read_csv_ready_file_output will contain a list of csv file entries,
+// with each containing the path of a csv, a path to a header file,
+// the rowcount for that file, the target table name, and a checksum
+read_csv_ready_file_output = read_csv_ready_file(csv_root_path_input);
+
+// check that for each csv file entry, we have a corresponding csv file
+is_match_csv_file_tables_output = is_match_csv_file_tables(read_csv_ready_file_output);
+if(is_match_csv_file_tables_output) {
+
+ // These variables are used to sequence database accesses in a dataflow
+ // style, rather than using imperative sequencing operations. With a
+ // database that allowed parallel accesses, we would not need to have
+ // such tight sequencing, though we would still need some.
external db_over_time[];
- external dbinit; // some bug in analysis means can't use db_over_time for initial one
+ external dbinit; // some bug in Swift static analysis means can't use db_over_time for initial one
+
+
+ // create_empty_load_db_output contains a reference to the
+ // newly created database
(create_empty_load_db_output, dbinit) = create_empty_load_db(job_id);
+
+ // count the entries in read_csv_ready_file_output
+ // TODO could this table be read into a Swift struct array?
count_entries_output = count_entries(read_csv_ready_file_output);
entries = readData(count_entries_output);
int entries_seq[] = [1:entries];
-
+
+ // split_list_output[] will contain one element for each entry in
+ // read_csv_ready_file_output
foreach i in entries_seq {
split_list_output[i] = extract_entry(read_csv_ready_file_output, i);
}
-
-// TODO this can merge with merge with above foreach
+// TODO this can be merged with the above foreach, and split_list_output
+// does not need to be an array then
foreach i in entries_seq {
boolean is_exists_csv_file_output;
@@ -153,28 +173,57 @@
boolean update_computed_columns_output;
boolean is_match_table_row_count_output;
boolean is_match_table_column_ranges_output;
-
+
+ // check that the data files exist, and use an external to
+ // sequence this before read_csv_file_column_names
is_exists_csv_file_output = is_exists_csv_file(split_list_output[i]);
external thread6 = checkvalid(is_exists_csv_file_output);
-read_csv_file_column_names_output = read_csv_file_column_names(split_list_output[i], thread6);
+
+ // update the CSVFileEntry with column names read from the header file
+ // (which is itself listed in the CSVFileEntry)
+ read_csv_file_column_names_output = read_csv_file_column_names(split_list_output[i], thread6);
+
+ // check that the table column names match the csv column names
+ // TODO where is the database reference passed in? is there enough
+ // information in the CSVFileEntry to determine that? there is a
+ // table name at least... I guess I should look inside
is_match_csv_file_column_names_output = is_match_csv_file_column_names(read_csv_file_column_names_output);
external thread2 = checkvalid(is_match_csv_file_column_names_output);
- if(i==1) { // first element...
+ // explicit external-based sequencing between checking table validity and
+ // loading into the DB
+
+ if(i==1) { // first element... see above note about not being
+ // able to use db_over_time[1] for this.
load_csv_file_into_table_output = load_csv_file_into_table(create_empty_load_db_output, read_csv_file_column_names_output, dbinit);
} else {
load_csv_file_into_table_output = load_csv_file_into_table(create_empty_load_db_output, read_csv_file_column_names_output, db_over_time[i]);
}
external thread3=checkvalid(load_csv_file_into_table_output);
+
+ // explicitly sequence the database load and the computed-column-update
+
+ // update the computed columns, and fail the workflow if this fails
update_computed_columns_output = update_computed_columns(create_empty_load_db_output, read_csv_file_column_names_output, thread3);
external thread4 = checkvalid(update_computed_columns_output);
+
+
+ // now check that we loaded in the right number of rows, and ranges
+ // TODO With a parallelised database, we could do these two checks in
+ // parallel, I think. But for now, we use 'thread1' to sequence them
+ // explicitly
+
is_match_table_row_count_output = is_match_table_row_count(create_empty_load_db_output, read_csv_file_column_names_output, thread4);
external thread1 = checkvalid(is_match_table_row_count_output);
is_match_table_column_ranges_output = is_match_table_column_ranges(create_empty_load_db_output, read_csv_file_column_names_output, thread1);
db_over_time[i+1] = checkvalid(is_match_table_column_ranges_output);
+
}
+
+ // TODO if we did this in a more parallel fashion, could wait for the
+ // whole db_over_time array to be closed...
compact_database(create_empty_load_db_output, db_over_time[entries+1]);
- }
- else {
+}
+else {
stop();
- }
+}
More information about the Swift-commit
mailing list