diff --git a/README.md b/README.md index 611b96aa2caf3725eda6510e9168543015c4b888..0c86fb3c424825f601e85083c8b5f0909fa71c55 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,9 @@ found [here](https://github.com/CH-Earth/summa#readme). SUMMA-Actors is a modifi of SUMMA that uses the Actor Model to increase scalability and fault-tolerance. The actor which is known as the basic unit of concurrent computation is at the heart of this software. SUMMA-Actors is built using the [C++ Actor Framework](https://github.com/actor-framework/actor-framework). +## Documentation +A more in-depth documentation can be found [here](https://summa-actors.readthedocs.io/en/latest/) + ## Compiling Summa-Actors SUMMA-Actors is written in C++ and FORTRAN and can be compiled with a C++ and FORTRAN compiler from the GNU Compiler Collection. We have compiled SUMMA-Actors with the diff --git a/build/makefile b/build/makefile index 94e0ae83833e9a61693b125b9f8cc8feb243d08a..1b3442e5ed5c9f0f4422389888ea0fd6ea9ca4d8 100644 --- a/build/makefile +++ b/build/makefile @@ -14,34 +14,20 @@ # ACTORS_INCLUDES = # ACTORS_LIBRARIES = -# gfortran compiler flags -ifeq "$(FC)" "gfortran" + # Production runs -# FLAGS_NOAH = -O3 -ffree-form -ffree-line-length-none -fmax-errors=0 -fPIC -# FLAGS_COMM = -O3 -ffree-line-length-none -fmax-errors=0 -fPIC -# FLAGS_SUMMA = -O3 -ffree-line-length-none -fmax-errors=0 -fPIC -# FLAGS_ACTORS = -O3 +FLAGS_NOAH = -O3 -ffree-form -ffree-line-length-none -fmax-errors=0 -fPIC +FLAGS_COMM = -O3 -ffree-line-length-none -fmax-errors=0 -fPIC +FLAGS_SUMMA = -O3 -ffree-line-length-none -fmax-errors=0 -fPIC +FLAGS_ACTORS = -O3 # # Debug runs -FLAGS_NOAH = -p -g -O0 -ffree-form -ffree-line-length-none -fmax-errors=0 -fbacktrace -Wno-unused -Wno-unused-dummy-argument -fPIC -FLAGS_COMM = -p -g -O0 -Wall -ffree-line-length-none -fmax-errors=0 -fbacktrace -fcheck=bounds -fPIC -FLAGS_SUMMA = -p -g -O0 -Wall -ffree-line-length-none -fmax-errors=0 -fbacktrace -fcheck=bounds -fPIC -FLAGS_ACTORS = 
-g -O0 -Wall -endif - -# ifort compiler flags -ifeq "$(FC)" "ifort" - -# define compiler flags -# FLAGS_NOAH = -O3 -autodouble -warn nounused -noerror_limit -FR -auto -fltconsistency -fPIC -# FLAGS_COMM = -O3 -FR -auto -warn nounused -fltconsistency -fpe0 -fPIC -# FLAGS_SUMMA = -O3 -FR -auto -warn nounused -fltconsistency -fpe0 -fPIC - -# debug runs -FLAGS_NOAH = -O3 -g -autodouble -warn nounused -noerror_limit -FR -auto -fltconsistency -fPIC -FLAGS_COMM = -O3 -g -FR -auto -warn nounused -fltconsistency -fpe0 -fPIC -FLAGS_SUMMA = -O3 -g -FR -auto -warn nounused -fltconsistency -fpe0 -fPIC -endif +# FLAGS_NOAH = -p -g -O0 -ffree-form -ffree-line-length-none -fmax-errors=0 -fbacktrace -Wno-unused -Wno-unused-dummy-argument -fPIC +# FLAGS_COMM = -p -g -O0 -Wall -ffree-line-length-none -fmax-errors=0 -fbacktrace -fcheck=bounds -fPIC +# FLAGS_SUMMA = -p -g -O0 -Wall -ffree-line-length-none -fmax-errors=0 -fbacktrace -fcheck=bounds -fPIC +# FLAGS_ACTORS = -g -O0 -Wall + + #======================================================================== # PART 1: Define directory paths #======================================================================== diff --git a/build/source/actors/FileAccessActor.h b/build/source/actors/FileAccessActor.h index d805659f60158d6c6fb1b4bd3d143e4e7cbe8bd8..c8d51e6f245b18297d1fc0ddc4dddadd5c6d18db 100644 --- a/build/source/actors/FileAccessActor.h +++ b/build/source/actors/FileAccessActor.h @@ -15,10 +15,13 @@ behavior file_access_actor(stateful_actor<file_access_state>* self, int startGRU self->state.numGRU = numGRU; self->state.startGRU = startGRU; self->state.outputStrucSize = outputStrucSize; + + aout(self) << "\nFile Access Actor Started\n"; initalizeFileAccessActor(self); return { - [=](initalize_outputStrucure) { + [=](initalize_outputStructure) { + aout(self) << "Initalizing Output Structure" << std::endl; Init_OutputStruct(self->state.handle_forcFileInfo, &self->state.outputStrucSize, &self->state.numGRU, &self->state.err); }, @@ 
-152,30 +155,47 @@ void initalizeFileAccessActor(stateful_actor<file_access_state>* self) { // aout(self) << "Set Up the forcing file" << std::endl; ffile_info_C(&indx, self->state.handle_forcFileInfo, &self->state.numFiles, &err); if (err != 0) { - aout(self) << "Error: ffile_info_C - HRU = " << indx << - " - indxGRU = " << indx << " - refGRU = " << std::endl; + aout(self) << "Error: ffile_info_C - File_Access_Actor \n"; + std::string function = "ffile_info_C"; + self->send(self->state.parent, file_access_actor_err_v, function); self->quit(); + return; } mDecisions_C(&self->state.num_steps, &err); if (err != 0) { - aout(self) << "Error: mDecisions - FileAccess Actor " << std::endl; + aout(self) << "Error: mDecisions - FileAccess Actor \n"; + std::string function = "mDecisions_C"; + self->send(self->state.parent, file_access_actor_err_v, function); self->quit(); + return; } read_pinit_C(&err); if (err != 0) { aout(self) << "ERROR: read_pinit_C\n"; + std::string function = "read_pinit_C"; + self->send(self->state.parent, file_access_actor_err_v, function); + self->quit(); + return; } read_vegitationTables(&err); if (err != 0) { aout(self) << "ERROR: read_vegitationTables\n"; + std::string function = "read_vegitationTables"; + self->send(self->state.parent, file_access_actor_err_v, function); + self->quit(); + return; } Create_Output_File(self->state.handle_ncid, &self->state.numGRU, &self->state.startGRU, &err); if (err != 0) { aout(self) << "ERROR: Create_OutputFile\n"; + std::string function = "Create_Output_File"; + self->send(self->state.parent, file_access_actor_err_v, function); + self->quit(); + return; } diff --git a/build/source/actors/JobActor.h b/build/source/actors/JobActor.h index ac00049d41d0b0e3e9183a86f8e2ddc5ebb7fcc4..63abb30f67f79f70dfc0212574a6b4ca819b4115 100644 --- a/build/source/actors/JobActor.h +++ b/build/source/actors/JobActor.h @@ -43,16 +43,20 @@ behavior job_actor(stateful_actor<job_state>* self, int startGRU, int numGRU, aout(self) << 
"Job Actor Initalized \n"; return { - [=](done_file_access_actor_init) { - // Init GRU Actors and the Output Structure - self->send(self->state.file_access_actor, initalize_outputStrucure_v); - self->send(self, init_hru_v); - }, - + // ******************************************************************************************* + // *********************************** INTERFACE WITH HRU ************************************ + // ******************************************************************************************* + + /** + * + */ [=](init_hru) { initalizeGRU(self); }, + /** + * + */ [=](done_init_hru) { if (debug) { aout(self) << "Done Init\n"; @@ -68,17 +72,12 @@ behavior job_actor(stateful_actor<job_state>* self, int startGRU, int numGRU, } }, - /** - * Message from HRUActor, HRU is done the current forcing file but is not - * done its simulation and needs the next file - * indxGRU - Index into the actor array so we know which HRU this is. - * NOTE: Naming of GRU and HRU is confusing as the plan is to further seperate - * NOTE: For NA_Domain GRU is used as that is how we index the forcing file - */ [=](done_hru, int indxGRU, double totalDuration, double initDuration, double forcingDuration, double runPhysicsDuration, double writeOutputDuration) { - aout(self) << "GRU " << indxGRU << " Done\n"; + aout(self) << "GRU:" << self->state.GRUList[indxGRU - 1]->getRefGRU() + << "indxGRU = " << indxGRU << "Done \n"; + self->state.GRUList[indxGRU - 1]->doneRun(totalDuration, initDuration, forcingDuration, runPhysicsDuration, writeOutputDuration); @@ -100,6 +99,38 @@ behavior job_actor(stateful_actor<job_state>* self, int startGRU, int numGRU, } }, + + [=](run_failure, int indxGRU, int err) { + aout(self) << "GRU:" << self->state.GRUList[indxGRU - 1]->getRefGRU() + << "indxGRU = " << indxGRU << "Failed \n" + << "Will have to wait until all GRUs are done before it can be re-tried\n"; + + self->state.numGRUFailed++; + self->state.numGRUDone++; + self->state.GRUList[indxGRU 
- 1]->updateFailed(); + + // check if we are the last hru to complete + if (self->state.numGRUDone >= self->state.numGRU) { + restartFailures(self); + } + }, + + // ******************************************************************************************* + // ******************************* END INTERFACE WITH HRU ************************************ + // ******************************************************************************************* + + // ******************************************************************************************* + // ****************************** INTERFACE WITH FileAccessActor ***************************** + // ******************************************************************************************* + /** + * + */ + [=](done_file_access_actor_init) { + // Init GRU Actors and the Output Structure + self->send(self->state.file_access_actor, initalize_outputStructure_v); + self->send(self, init_hru_v); + }, + [=](file_access_actor_done, double readDuration, double writeDuration) { int err = 0; if (debug) { @@ -135,18 +166,15 @@ behavior job_actor(stateful_actor<job_state>* self, int startGRU, int numGRU, self->quit(); }, - [=](run_failure, int indxGRU, int err) { - aout(self) << "GRU:" << indxGRU << "Failed \n" << - "Will have to wait until all GRUs are done before it can be re-tried\n"; - self->state.numGRUFailed++; - self->state.numGRUDone++; - self->state.GRUList[indxGRU - 1]->updateFailed(); - - // check if we are the last hru to complete - if (self->state.numGRUDone >= self->state.numGRU) { - restartFailures(self); - } - }, + [=](file_access_actor_err, std::string function) { + aout(self) << "Failure in File Access Actor in function" << function << "\n"; + aout(self) << "Letting Parent Know we are quitting\n"; + self->send(self->state.parent, err_v); + self->quit(); + } + // ******************************************************************************************* + // ************************** END INTERFACE WITH 
FileAccessActor ***************************** + // ******************************************************************************************* }; } @@ -260,8 +288,6 @@ void restartFailures(stateful_actor<job_state>* self) { gru->updateGRU(newGRU); gru->updateCurrentAttempt(); self->send(gru->getActor(), dt_init_factor_v, gru->getDt_init()); - } else { - aout(self) << "We are done \n"; } } } diff --git a/build/source/actors/SummaActor.h b/build/source/actors/SummaActor.h index a9840effc2d660d9427c440fb291e52cf5b2c935..4bfe0e9e8e85cfd742804e9500e1fe533654259e 100644 --- a/build/source/actors/SummaActor.h +++ b/build/source/actors/SummaActor.h @@ -49,6 +49,11 @@ behavior summa_actor(stateful_actor<summa_manager>* self, int startGRU, int numG spawnJob(self); } }, + + [=](err) { + aout(self) << "Unrecoverable Error: Attempting To Fail Gracefully\n"; + self->quit(); + } }; } @@ -57,6 +62,7 @@ void spawnJob(stateful_actor<summa_manager>* self) { // Ensure we do not start a job with too many GRUs if (self->state.numGRU > self->state.maxGRUPerJob) { // spawn the job actor + aout(self) << "\n Starting Job with startGRU = " << self->state.startGRU << "\n"; self->state.currentJob = self->spawn(job_actor, self->state.startGRU, self->state.maxGRUPerJob, self->state.configPath, self->state.outputStrucSize, self); @@ -83,8 +89,8 @@ void parseSettings(stateful_actor<summa_manager>* self, std::string configPath) json SummaActorConfig = settings["SummaActor"]; // Find the desired OutputStrucSize - if (SummaActorConfig.find("OuputStrucureSize") != SummaActorConfig.end()) { - self->state.outputStrucSize = SummaActorConfig["OuputStrucureSize"]; + if (SummaActorConfig.find("OuputStructureSize") != SummaActorConfig.end()) { + self->state.outputStrucSize = SummaActorConfig["OuputStructureSize"]; } else { aout(self) << "Error Finding OutputStructureSize in JOSN - Reverting to default value\n"; self->state.outputStrucSize = 250; diff --git a/build/source/actors/messageAtoms.h 
b/build/source/actors/messageAtoms.h index f31f3196c9ea81f802903022df69def99d0d33e6..d845c798b85d32847a9bb074c388616737636a37 100644 --- a/build/source/actors/messageAtoms.h +++ b/build/source/actors/messageAtoms.h @@ -5,6 +5,7 @@ CAF_BEGIN_TYPE_ID_BLOCK(summa, first_custom_type_id) // Summa Actor CAF_ADD_ATOM(summa, start_summa) CAF_ADD_ATOM(summa, done_job) + CAF_ADD_ATOM(summa, err) // Job Actor CAF_ADD_ATOM(summa, done_reading_forcingFile) CAF_ADD_ATOM(summa, done_reading_first_forcing_file) @@ -17,8 +18,9 @@ CAF_BEGIN_TYPE_ID_BLOCK(summa, first_custom_type_id) CAF_ADD_ATOM(summa, run_failure) CAF_ADD_ATOM(summa, done_file_access_actor_init) CAF_ADD_ATOM(summa, file_access_actor_done) + CAF_ADD_ATOM(summa, file_access_actor_err) // FileAccess Actor - CAF_ADD_ATOM(summa, initalize_outputStrucure) + CAF_ADD_ATOM(summa, initalize_outputStructure) CAF_ADD_ATOM(summa, access_forcing) CAF_ADD_ATOM(summa, access_first_forcing_file) CAF_ADD_ATOM(summa, access_forcing_internal) diff --git a/build/source/engine/conv_funcs.f90 b/build/source/engine/conv_funcs.f90 index 2a07b7cdfe68583e2788e5f3d1a81d400f4e987c..b4bc8f27042cb414bb75555ed7683f2c5dcd690a 100755 --- a/build/source/engine/conv_funcs.f90 +++ b/build/source/engine/conv_funcs.f90 @@ -341,8 +341,7 @@ do iter=1,maxiter ! check if achieved tolerance if(abs(f0) < Xtol) exit ! check convergence - ! TODO: Changed the below to continue the hru computation - ! if(iter==maxiter)stop 'failed to converge in WETBULBTMP' +! if(iter==maxiter)stop 'failed to converge in WETBULBTMP' if(iter==maxiter)print*, 'failed to converge in WETBULBTMP' end do ! (iterating) diff --git a/build/source/engine/derivforce.f90 b/build/source/engine/derivforce.f90 index 8fb7905e3e39ef1ed464ab456616d512e195bb13..a2d75e72653c55c16646e6613a916d1ff9a625cc 100755 --- a/build/source/engine/derivforce.f90 +++ b/build/source/engine/derivforce.f90 @@ -258,7 +258,6 @@ contains ! 
ensure wind speed is above a prescribed minimum value if(windspd < minwind) windspd=minwind - ! compute relative humidity (-) relhum = SPHM2RELHM(spechum, airpres, airtemp) ! if relative humidity exceeds saturation, then set relative and specific humidity to saturation diff --git a/build/source/interface/file_access_actor/cppwrap_fileAccess.f90 b/build/source/interface/file_access_actor/cppwrap_fileAccess.f90 index 0c5341360de5ea9a15311bded3291f99b8f07a99..8b7225225085526ca4104d274885ca0906e89fc3 100644 --- a/build/source/interface/file_access_actor/cppwrap_fileAccess.f90 +++ b/build/source/interface/file_access_actor/cppwrap_fileAccess.f90 @@ -284,6 +284,7 @@ subroutine FileAccessActor_WriteOutput(& indxGRU, & ! index of GRU we are currently writing for indxHRU, & ! index of HRU we are currently writing for err) bind(C, name="FileAccessActor_WriteOutput") + USE def_output_module,only:def_output ! module to define model output USE globalData,only:gru_struc USE var_lookup,only:maxVarFreq ! # of available output frequencies USE writeOutput_module,only:writeBasin,writeTime,writeData diff --git a/build/source/interface/file_access_actor/initOutputStruc.f90 b/build/source/interface/file_access_actor/initOutputStruc.f90 index 29895e7c39e39d71c6fb0f9c7dd20aa6a6713438..f9d48fd3ef4018dad57353b2ac2fcba7835ab4fa 100644 --- a/build/source/interface/file_access_actor/initOutputStruc.f90 +++ b/build/source/interface/file_access_actor/initOutputStruc.f90 @@ -23,9 +23,6 @@ subroutine initalizeOutput(forcFileInfo, maxSteps, nGRU, err) USE multiconst,only:secprday ! number of seconds in a day USE data_types,only:file_info_array USE var_lookup,only:maxvarFreq ! 
maximum number of output files - - - implicit none type(file_info_array), pointer :: forcFileInfo diff --git a/build/source/interface/job_actor/cppwrap_job.f90 b/build/source/interface/job_actor/cppwrap_job.f90 index 84227646dd359fe478a1fec9ede7c37a29dfd15d..8ece46e9231f10f37655b5edfcffa77b11e38a7c 100644 --- a/build/source/interface/job_actor/cppwrap_job.f90 +++ b/build/source/interface/job_actor/cppwrap_job.f90 @@ -154,7 +154,7 @@ subroutine cleanUpJobActor(err) bind(C, name='cleanUpJobActor') implicit none integer(c_int), intent(inout) :: err - err = 0 + err=0 ! Deallocate Time Varaibles deallocate(startTime%var); diff --git a/config/caf-application.conf b/config/caf-application.conf deleted file mode 100644 index 83aa9138023a1e80e5120e7d8dd6c301cb3abf98..0000000000000000000000000000000000000000 --- a/config/caf-application.conf +++ /dev/null @@ -1,6 +0,0 @@ -caf { - # Parameters selecting a default scheduler. - scheduler { - max-threads = 4 - } -} \ No newline at end of file diff --git a/config/configuration.py b/config/configuration.py new file mode 100644 index 0000000000000000000000000000000000000000..063fbc6bf336fedf54f37108e938042e342190ff --- /dev/null +++ b/config/configuration.py @@ -0,0 +1,248 @@ +from distutils.command.config import config +import json +import os +import math +from os.path import exists +from datetime import date + +def actor_setting(actor_id, setting_name, setting_value): + new_dic = {actor_id: {}} + + +""" +Function to create the inital summa_actors_settings file +""" +def create_init_config(): + Settings_file = { + "JobSubmissionParams": { + "cpus-per-task": 1, + "memory": "", + "job-name": "", + "account": "", + "numHRUs": 1, + "maxNumberOfJobs": 1, + "maxGRUsPerSubmission": 1, + "executablePath": "" + }, + + "Configuration": { + "controlVersion": "", + "simStartTime": "", + "simEndTime": "", + "tmZoneInfo": "", + "settingsPath": "", + "forcingPath": "", + "outputPath": "", + "forcingFreq": "", + "forcingStart": "", + 
"decisionsFile": "", + "outputControlFile": "", + "globalHruParamFile": "", + "globalGruParamFile": "", + "attributeFile": "", + "trialParamFile": "", + "forcingListFile": "", + "initConditionFile": "", + "outFilePrefix": "", + "vegTableFile": "", + "soilTableFile": "", + "generalTableFile": "", + "noahmpTableFile": "" + }, + + "SummaActor": { + "OuputStructureSize": 1, + "maxGRUPerJob": 1 + }, + + "JobActor": { + "FileManagerPath": "", + "outputCSV": "", + "csvPath": "" + }, + + "HRUActor": { + "printOutput": "", + "outputFrequency": 1 + } + } + with open('Summa_Actors_Settings.json', 'w') as outfile: + json.dump(Settings_file, outfile, indent=2) + +""" +Function that creates the paths for the slurm output and the netCDF data +""" +def create_output_path(outputPath): + print("The output path exists, now seperating this run by today's date") + today = date.today() + todays_date = today.strftime("%b-%d-%Y") + outputPath += "{}/".format(todays_date) + if not exists(outputPath): + os.mkdir(outputPath) + print("Directory Created. 
Now Creating sub directories for SLURM Data and NetCDF data") + outputNetCDF = outputPath + "netcdf/" + outputSlurm = outputPath + "slurm/" + if not exists(outputNetCDF): + os.mkdir(outputNetCDF) + if not exists(outputSlurm): + os.mkdir(outputSlurm) + + # need to add the file name to outputSlurm + # The job will not be submitted without a file name + outputSlurm += "slurm-%A_%a.out" + + return outputNetCDF, outputSlurm + + +def create_file_manager(): + json_file = open("Summa_Actors_Settings.json") + fileManagerSettings = json.load(json_file) + json_file.close() + + # add the date for the run + outputPath = fileManagerSettings["Configuration"]["outputPath"] + if exists(outputPath): + outputNetCDF, outputSlurm = create_output_path(outputPath) + fileManagerSettings["Configuration"]["outputPath"] = outputNetCDF + else: + print("Output path does not exist, Ensure it exists before running this setup") + return -1 + + fileManager = open("fileManager.txt", "w") + for key,value in fileManagerSettings["Configuration"].items(): + fileManager.write(key + " \'{}\'\n".format(value)) + fileManager.close() + + with open("Summa_Actors_Settings.json") as settings_file: + data = json.load(settings_file) + data["JobActor"]["FileManagerPath"] = os.getcwd() + "/" + "fileManager.txt" + + with open("Summa_Actors_Settings.json", "w") as updated_settings: + json.dump(data, updated_settings, indent=2) + + + print("File Manager for this job has been created") + return outputSlurm + + +def create_caf_config(): + json_file = open("Summa_Actors_Settings.json") + SummaSettings = json.load(json_file) + json_file.close() + + numCPUs = SummaSettings["JobSubmissionParams"]["cpus-per-task"] + + + caf_config_name = "caf-application.conf" + caf_config = open(caf_config_name, "w") + caf_config.write("caf {{ \n scheduler {{\n max-threads = {}\n }}\n}}".format(numCPUs)) + caf_config.close() + + caf_config_path = os.getcwd() + caf_config_path += "/" + caf_config_path += caf_config_name + return 
caf_config_path + +""" +Function to create the a list of the jobs will run +This is used for submitting the array job +""" +def create_job_list(): + json_file = open("Summa_Actors_Settings.json") + SummaSettings = json.load(json_file) + json_file.close() + + numberOfTasks = SummaSettings["JobSubmissionParams"]["numHRUs"] + GRUPerJob = SummaSettings["JobSubmissionParams"]["maxGRUsPerSubmission"] + numCPUs = SummaSettings["JobSubmissionParams"]["cpus-per-task"] + print(numberOfTasks) + print(GRUPerJob) + print(numCPUs) + + # we need to get the full path of the summa binary + os.chdir("../build") + summaPath = os.getcwd() + summaPath += "/summaMain" + os.chdir("../config") + config_dir = os.getcwd() + caf_config_path = create_caf_config(numCPUs) + + + # we want to assemble the job list + job_list = open("job_list.txt", "w") + gruStart = 1 + jobCount = 0 + while gruStart < numberOfTasks: + if (numberOfTasks - gruStart < GRUPerJob): + job_list.write("{} -g {} -n {} -c {} --config-file={}\n".format(summaPath,\ + gruStart, numberOfTasks - gruStart, config_dir, caf_config_path)) + else: + job_list.write("{} -g {} -n {} -c {} --config-file={}\n".format(summaPath,\ + gruStart, GRUPerJob, config_dir, caf_config_path)) + gruStart += GRUPerJob + jobCount += 1 + + return jobCount + + +def create_sbatch_file(outputSlurm, configFile): + json_file = open("Summa_Actors_Settings.json") + SummaSettings = json.load(json_file) + json_file.close() + + numCPUs = SummaSettings["JobSubmissionParams"]["cpus-per-task"] + memory = SummaSettings["JobSubmissionParams"]["memory"] + jobName = SummaSettings["JobSubmissionParams"]["job-name"] + account = SummaSettings["JobSubmissionParams"]["account"] + numberOfTasks = SummaSettings["JobSubmissionParams"]["numHRUs"] + GRUPerJob = SummaSettings["JobSubmissionParams"]["maxGRUsPerSubmission"] + executablePath = SummaSettings["JobSubmissionParams"]["executablePath"] + + jobCount = math.ceil(numberOfTasks / GRUPerJob - 1) + + configPath = os.getcwd() + + 
sbatch = open("run_summa.sh", "w") + sbatch.write("#!/bin/bash\n") + sbatch.write("#SBATCH --cpus-per-task={}\n".format(numCPUs)) + sbatch.write("#SBATCH --time=24:00:00\n") + sbatch.write("#SBATCH --mem={}\n".format(memory)) + sbatch.write("#SBATCH --job-name={}\n".format(jobName)) + sbatch.write("#SBATCH --account={}\n".format(account)) + sbatch.write("#SBATCH --output={}\n".format(outputSlurm)) + sbatch.write("#SBATCH --array=0-{}\n\n".format(jobCount)) + sbatch.write("gruMax={}\n".format(numberOfTasks)) + sbatch.write("gruCount={}\n".format(GRUPerJob)) + sbatch.write("offset=$SLURM_ARRAY_TASK_ID\n") + sbatch.write("gruStart=$(( 1 + gruCount*offset ))\n") + sbatch.write("check=$(( $gruStart + $gruCount ))\n") + sbatch.write("if [ $check -gt $gruMax ]\n") + sbatch.write("then\n") + sbatch.write(" gruCount=$(( gruMax-gruStart+1 ))\n") + sbatch.write("fi\n\n") + sbatch.write("{} -g ${{gruStart}} -n ${{gruCount}} -c {} --config-file={}".format(executablePath, configPath, configFile)) + + + +""" +Funciton checks if the Summa_Actors_Settings.json file exists. +If yes: + move on +If no: + create it +""" +def init_run(): + Summa_Settings_Path = './Summa_Actors_Settings.json' + if exists('./Summa_Actors_Settings.json'): + print("File Exists, What do we do next") + outputSlurm = create_file_manager() + # jobCount = create_job_list() + configFile = create_caf_config() + create_sbatch_file(outputSlurm, configFile) + + + else: + print("File Does not Exist and we need to create it") + create_init_config() + +init_run() \ No newline at end of file diff --git a/docs/SUMMA-Actors_documentation.md b/docs/SUMMA-Actors_documentation.md new file mode 100644 index 0000000000000000000000000000000000000000..a62bb59b123f469bab4aa1dbbe5eb57e54ef4c11 --- /dev/null +++ b/docs/SUMMA-Actors_documentation.md @@ -0,0 +1,6 @@ +# SUMMA-Actors Documentation Guide + +SUMMA-Actors is a beta software that currently does not have the full capabilities of the original version of SUMMA. 
The most notable +feature not currently implemented is the simulation of lateral flows. SUMMA-Actors can solve for HRUs that do not have any dependencies on other HRUs. Although we are working to implement the full functionality of SUMMA into SUMMA-Actors. + +The documentation is organized as follows. We provide information for installing SUMMA-Actors on your system as well as some information for how to install SUMMA-Actors on Clusters such as the ones provided by Compute Canada. \ No newline at end of file diff --git a/docs/contact.md b/docs/contact.md new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 0000000000000000000000000000000000000000..d910e09ec3a65150749e8c2a67461a4741b54a42 --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,14 @@ +site_name: SUMMA-Actors +site_description: Structure for Unifying Multiple Modeling Alternatives - With Actors +site_author: Numerical Simulations Laboratory - University of Saskatchewan +repo_url: https://github.com/KyleKlenk/Summa-Actors +site_url: https://summa-actors.readthedocs.io +docs_dir: docs +theme: readthedocs +nav: +- Home: + - SUMMA-Actors Overview: 'index.md' + - SUMMA-Actors Documentation: 'SUMMA-Actors_documentation.md' +- Contact: + - Contact Information: "contact.md" + diff --git a/config/fileManager_example.txt b/utils/fileManager_example.txt similarity index 100% rename from config/fileManager_example.txt rename to utils/fileManager_example.txt diff --git a/utils/netcdf/ChunkingScripts/NA_Domain_Chunking.py b/utils/netcdf/ChunkingScripts/NA_Domain_Chunking.py new file mode 100644 index 0000000000000000000000000000000000000000..a3d40c5868e808706b09b6aa895ec8d317ab037d --- /dev/null +++ b/utils/netcdf/ChunkingScripts/NA_Domain_Chunking.py @@ -0,0 +1,51 @@ +import subprocess +import sys + +# nccopy -c spechum:744,1000 -c airtemp:744,1000 -c pptrate:744,1000 -c SWRadAtm:744,1000 -c LWRadAtm:744,1000 
-c airpres:744,1000 +# -c windspd:744,1000 NorthAmerica_remapped_1979-01-01-00-00-00.nc NorthAmerica_remapped_1979-01-01-00-00-00-chunked.nc +def chunkCommand(timesteps, infile, outfile): + bashCommand = subprocess.run(["nccopy", "-c", "spechum:{},1000".format(timesteps), "-c", "airtemp:{},1000".format(timesteps), \ + "-c", "pptrate:{},1000".format(timesteps), "-c", "SWRadAtm:{},1000".format(timesteps), "-c", "LWRadAtm:{},1000".format(timesteps), \ + "-c", "airpres:{},1000".format(timesteps), "-c", "windspd:{},1000".format(timesteps), "{}".format(infile), "{}".format(outfile)]) + print("Exit Code = %d" % bashCommand.returncode) + + +def checkTimeSteps(year, month): + if month == 1 or month == 3 or month == 5 or month == 7 or month == 8 or \ + month == 10 or month == 12: + return str(744) + elif month == 2: + if year % 4 == 0: + return str(696) + else: + return str(672) + elif month == 4 or month == 6 or month == 9 or month == 11: + return str(720) + + +def chunkYear(year): + year = sys.argv[1] + month = 1 + year = int(year) + while month != 13: + infile = "/project/6008034/kklenk/forcing/NorthAmerica_remapped_{}-{monthS}-01-00-00-00.nc".format(str(year), monthS=(str(0)+str(month)) if month < 10 else str(month)) + outfile = "/home/kklenk/scratch/corruptionTest/NorthAmerica_remapped_{}-{monthS}-01-00-00-00-chunked.nc".format(str(year), monthS=(str(0)+str(month)) if month < 10 else str(month)) + + timesteps = checkTimeSteps(year, month) + + print(infile) + print(outfile) + + chunkCommand(timesteps, infile, outfile) + month += 1 + +def chunkSpecificFile(year, month): + infile = "/project/6008034/kklenk/forcing/NorthAmerica_remapped_{}-{monthS}-01-00-00-00.nc".format(str(year), monthS=(str(0)+str(month)) if month < 10 else str(month)) + outfile = "/home/kklenk/scratch/forcingData/NorthAmerica_remapped_{}-{monthS}-01-00-00-00-chunked.nc".format(str(year), monthS=(str(0)+str(month)) if month < 10 else str(month)) + + timesteps = checkTimeSteps(year, month) + print(infile) 
+ print(outfile) + chunkCommand(timesteps, infile, outfile) + +chunkSpecificFile(1983, 5) \ No newline at end of file diff --git a/utils/netcdf/ChunkingScripts/NA_Domain_Chunking_Script.sh b/utils/netcdf/ChunkingScripts/NA_Domain_Chunking_Script.sh new file mode 100644 index 0000000000000000000000000000000000000000..7ac6cdbc66e44e978ee99b7dd379348556b59167 --- /dev/null +++ b/utils/netcdf/ChunkingScripts/NA_Domain_Chunking_Script.sh @@ -0,0 +1,25 @@ +#!/bin/bash +#SBATCH --cpus-per-task=1 +#SBATCH --time=1:15:00 +#SBATCH --mem=20G +#SBATCH --job-name=Forcing_dataConversion +#SBATCH --mail-user=kyle.klenk@usask.ca +#SBATCH --mail-type=ALL +#SBATCH --output=/home/kklenk/scratch/SummaActorsOutput/slurm/forcingdata-%A_%a.out +#SBATCH --account=def-spiteri_cpu + +# ---------------------------------------------------------------------------------------------- +# RUN WITH: +# sbatch --array1-[number of jobs] [script name] +# sbatch --array=0-100 run_all.sh +# ---------------------------------------------------------------------------------------------- + + + +YEAR=1979 + +offset=$SLURM_ARRAY_TASK_ID + +start=$(( YEAR + offset )) + +python3 /project/6008034/kklenk/NA_Domain_Chunking.py ${start} \ No newline at end of file diff --git a/utils/netcdf/OutputVerification/checkOutput.py b/utils/netcdf/OutputVerification/checkOutput.py new file mode 100644 index 0000000000000000000000000000000000000000..a9d4718db8e4bf1d501fe45dcef3ce76d59929c2 --- /dev/null +++ b/utils/netcdf/OutputVerification/checkOutput.py @@ -0,0 +1,21 @@ +from pathlib import Path +import xarray as xr + + +airtemp = "airtemp" + +filename = "outputChunked.txt" + +datasetPath = Path("/home/kklenk/projects/rpp-kshook/kklenk/forcingChunked/NorthAmerica_remapped_1983-05-01-00-00-00-chunked.nc") + +dataset = xr.open_dataset(datasetPath) + +data = [] + +data.append(dataset.isel(hru=492001).copy()) + +file = open(filename, "w") +for i in data[0][airtemp].values: + file.write("{}\n".format(i)) + +file.close() diff 
"""Collect SLURM `seff` statistics for every task of an array job into a CSV file.

Usage:
    python resourageUsage.py <jobId> <numJobs>

One row is written per array task (<jobId>_0 ... <jobId>_<numJobs-1>) to
SummaActors_jobStatistics.csv in the current working directory.
"""
import csv
import subprocess
from sys import argv

# Column order of SummaActors_jobStatistics.csv.
CSV_HEADER = ["startHRU", "numHRU", "CPU", "CPU Efficiency", "Wall-Clock Time", "Memory Used"]


def parseSeffOutput(lines):
    """Extract the statistics of interest from the raw output of `seff`.

    Parameters:
        lines: iterable of bytes objects, one per line of `seff` output.

    Returns:
        [cores, cpuEfficiency, wallClock, memory] as strings, in CSV column
        order. A field that `seff` did not report is returned as "" so that a
        missing value can never raise NameError or silently inherit the value
        parsed for a previous job.
    """
    def field(line, index):
        # Tokens are separated by single spaces; a malformed (too short) line
        # yields "" instead of raising IndexError.
        tokens = line.decode().split(" ")
        try:
            return tokens[index].strip()
        except IndexError:
            return ""

    cores = effeciency = wallClock = memory = ""
    for line in lines:
        if b'Cores per node:' in line:
            cores = field(line, -1)
        if b'CPU Efficiency:' in line:
            effeciency = field(line, 2)
        if b'Job Wall-clock time:' in line:
            wallClock = field(line, -1)
        if b'Memory Utilized:' in line:
            # NOTE(review): only the number is kept; the unit token that follows
            # it (e.g. MB/GB) is dropped -- confirm this is intended.
            memory = field(line, 2)
    return [cores, effeciency, wallClock, memory]


def seffCommand(jobId, numJobs, numHRU=1000):
    """Run `seff <jobId>_<i>` for i in [0, numJobs) and write one CSV row per task.

    Parameters:
        jobId:   SLURM array job id (string or int).
        numJobs: number of array tasks to query (anything int() accepts).
        numHRU:  number of HRUs handled by each task; used to derive the
                 startHRU column (numHRU * i + 1). Defaults to 1000, the value
                 that was previously hard-coded.

    Raises:
        OSError: if the `seff` executable cannot be started.
    """
    # newline='' is what the csv module requires of the file object it writes to;
    # the with-block closes the file even if a seff call fails.
    with open('SummaActors_jobStatistics.csv', 'w', newline='') as csvFile:
        writer = csv.writer(csvFile)
        writer.writerow(CSV_HEADER)

        for i in range(int(numJobs)):
            print("Job", i)
            # Argument list instead of shell=True: jobId comes from the command
            # line and must not be interpreted by a shell. run() also waits for
            # the child, so no unreaped processes are left behind.
            result = subprocess.run(["seff", "{}_{}".format(jobId, i)],
                                    stdout=subprocess.PIPE)
            stats = parseSeffOutput(result.stdout.splitlines())
            writer.writerow([numHRU * i + 1, numHRU] + stats)


def main():
    """Command-line entry point: resourageUsage.py <jobId> <numJobs>."""
    if len(argv) < 3:
        raise SystemExit("Usage: python resourageUsage.py <jobId> <numJobs>")

    jobId = argv[1]
    print(jobId)

    numJobs = argv[2]
    print(numJobs)

    seffCommand(jobId, numJobs)


if __name__ == "__main__":
    main()
"""Summarize the SUMMA-Actors SLURM log files (*.out) found in a folder.

Usage:
    python summarize_summaActors.py <folder>

For every log file a line is written to <folder>/_log_summaryActors.txt stating
either the run time in hours or that the job is still running / has failed.
"""
import os
import re
import sys

summaryFile = '_log_summaryActors.txt'
ext = ".out"

# Only this many lines at the end of a log are searched for the run time.
TAIL_LINES = 30


def _read_lines(path):
    """Return all lines of `path`, retrying as ISO-8859-1 if the default decoding fails."""
    try:
        with open(path, 'r') as logFile:
            return logFile.readlines()
    except UnicodeDecodeError:
        with open(path, encoding="ISO-8859-1") as logFile:
            return logFile.readlines()


def determine_output(folder, file):
    """Find the run time reported near the end of one log file.

    Parameters:
        folder: directory containing the log file (with or without a trailing
                path separator).
        file:   name of the log file inside `folder`.

    Returns:
        The non-empty list of decimal numbers (as strings) found on the last
        line, among the final TAIL_LINES lines, that contains "Hours" and at
        least one decimal number; or -1 when there is no such line (job still
        running or failed). The function never returns None or an empty list,
        so callers may index element 0 whenever the result is not -1.
    """
    # os.path.join supplies the separator that plain `folder + file` omitted.
    path = os.path.join(folder, file)
    print(path)

    for line in reversed(_read_lines(path)[-TAIL_LINES:]):
        if "Hours" in line:
            hours = re.findall(r"\d+\.\d+", line)
            if hours:
                return hours
    return -1


def summarize(folder):
    """Write <folder>/_log_summaryActors.txt with one status line per *.out log file."""
    summaryPath = os.path.join(folder, summaryFile)

    # Start from a clean summary; a missing file is not an error.
    try:
        os.remove(summaryPath)
    except OSError:
        pass

    files = sorted(name for name in os.listdir(folder) if name.endswith(ext))

    with open(summaryPath, "w") as sf:
        sf.write('Summarizing log files in ' + folder + '\n \n')
        sf.write('Log files' + '\n')

        for file in files:
            value = determine_output(folder, file)
            if value == -1:
                sf.write("{} - Still Running or Failed\n".format(file))
            else:
                sf.write("{} - Success after {} hours \n".format(file, value[0]))


def main():
    """Command-line entry point: summarize_summaActors.py <folder>."""
    if len(sys.argv) == 1:
        sys.exit('Error: no input folder specified')
    summarize(sys.argv[1])


if __name__ == "__main__":
    main()
-import subprocess - -cmd = subprocess.Popen('seff 59326149_1', shell=True, stdout=subprocess.PIPE) - -for line in cmd.stdout: - if b'CPU Utilized:' in line: - print(line) \ No newline at end of file