From 983cd5c02af2976965ad2695333f701ac0e63074 Mon Sep 17 00:00:00 2001
From: kck540 <kyle.klenk@usask.ca>
Date: Thu, 6 Apr 2023 15:01:11 -0400
Subject: [PATCH] Add convenience scripts

Scripts for going over data within Summa
---
 utils/StatisticsScripts/sortCSV.py            | 10 ++
 .../summarize_summaActors.py                  | 97 ++++++++++++-------
 .../StatisticsScripts/summarize_summaOrig.py  | 89 +++++++++++++++++
 .../netcdf/OutputVerification/checkOutput.py  |  8 +-
 4 files changed, 166 insertions(+), 38 deletions(-)
 create mode 100644 utils/StatisticsScripts/sortCSV.py
 create mode 100644 utils/StatisticsScripts/summarize_summaOrig.py

diff --git a/utils/StatisticsScripts/sortCSV.py b/utils/StatisticsScripts/sortCSV.py
new file mode 100644
index 0000000..61eac2b
--- /dev/null
+++ b/utils/StatisticsScripts/sortCSV.py
@@ -0,0 +1,10 @@
+import pandas as pd
+
+# Read CSV file into a DataFrame
+df = pd.read_csv('/home/kklenk/scratch/Single_CPU_TEST/non-actors/logs/_log_summaryOriginal.csv')
+
+# Sort DataFrame by column
+df = df.sort_values('start_hru')
+
+# Write sorted DataFrame back to CSV file
+df.to_csv('/home/kklenk/scratch/Single_CPU_TEST/non-actors/logs/_log_summaryOriginal_sorted.csv', index=False)
\ No newline at end of file
diff --git a/utils/StatisticsScripts/summarize_summaActors.py b/utils/StatisticsScripts/summarize_summaActors.py
index 4a629be..372b8bd 100644
--- a/utils/StatisticsScripts/summarize_summaActors.py
+++ b/utils/StatisticsScripts/summarize_summaActors.py
@@ -1,39 +1,62 @@
+# Kyle Klenk, (kyle.klenk@usask.ca)
+# This file will summarize the files that are output by summa
 import os
 import re
 import sys
+import csv
 
-summaryFile = '_log_summaryActors.txt'
-ext = ".out"
+def get_job_stats(folder,file):
+	outFile = open(folder + file, 'r')
+	print(outFile)
+	
+	lines = outFile.readlines()
 
-if len(sys.argv) == 1:
-	sys.exit('Error: no input folder specified')
 
-else:
+	start_hru = int(''.join(filter(str.isdigit, file)))
+	
+	
+	max_lines_to_read = 40
+	lines_read_counter = 1
+	max_items_looking_for = 3
+	items_found = 0
+	row_data = [start_hru, -99, -99, -99]
+	
+	for line in reversed(lines):
+		if lines_read_counter > max_lines_to_read:
+			return row_data
+		
+		elif items_found == max_items_looking_for:
+			return row_data
+		
+		elif "Hours" in line:
+			hours = re.findall("\d+\.\d+", line)
+			row_data[1] = hours[0]
+			lines_read_counter += 1
 
-	folder = sys.argv[1]
+		elif "Total Read Duration" in line:
+			seconds = re.findall("\d+\.\d+", line)
+			row_data[2] = seconds[0]
+			lines_read_counter += 1
+		
+		elif "Total Write Duration" in line:
+			seconds = re.findall("\d+\.\d+", line)
+			row_data[3] = seconds[0]
+			lines_read_counter += 1
 
-def determine_output(folder,file):
-	outFile = open(folder + file, 'r')
-	print(outFile)
-	try:
-		lines = outFile.readlines()
-	except UnicodeDecodeError:
-		outFile.close()
-		outFile = open(folder + file, encoding = "ISO-8859-1")
-		lines = outFile.readlines()
-	counter = 1
-	for line in reversed(lines):
-		if counter > 30:
-			return -1
 		else:
-			if "Hours" in line:
-				hours = re.findall("\d+\.\d+", line)
-				return hours
-			counter += 1
-		
+			lines_read_counter += 1
+
+output_file = '_log_summaryActors.csv'
+ext = ".txt"
+
+# Check command line args
+if len(sys.argv) == 1:
+	sys.exit('Error: no input folder specified')
+else:
+	folder = sys.argv[1]
 
 try:
-	os.remove(folder + "/" + summaryFile)
+	os.remove(folder + "/" + output_file)
 except OSError:
 	pass
 
@@ -44,20 +67,26 @@ for file in os.listdir(folder):
 
 files.sort()
 
+
 total_success = []
 
 computation_time = []
 
-with open(folder + '/' + summaryFile, "w") as sf:
-	sf.write('Summarizing log files in ' + folder + '\n \n')
-	sf.write('Log files' + '\n')
+csv_file = open(folder + '/' + output_file, "w")
+writer = csv.writer(csv_file)
+csv_header = ["start_hru", "job_duration", "read_duration", "write_duration"]
+writer.writerow(csv_header)
+
+for file in files:
+	row_data = get_job_stats(folder, file)
+	if row_data is None:
+		start_hru = int(''.join(filter(str.isdigit, file)))
+		row_data = [start_hru, -99, -99, -99]
+
+	writer.writerow(row_data)
+
+csv_file.close()
 
-	for file in files:
-		value = determine_output(folder, file)
-		if value == -1:
-			sf.write("{} - Still Running or Failed\n".format(file))
-		else:
-			sf.write("{} - Success after {} hours \n".format(file, value[0]))
 
 
 
diff --git a/utils/StatisticsScripts/summarize_summaOrig.py b/utils/StatisticsScripts/summarize_summaOrig.py
new file mode 100644
index 0000000..465de9d
--- /dev/null
+++ b/utils/StatisticsScripts/summarize_summaOrig.py
@@ -0,0 +1,89 @@
+# Kyle Klenk, (kyle.klenk@usask.ca)
+# This file will summarize the files that are output by summa
+import os
+import re
+import sys
+import csv
+
+
+def get_job_stats(folder, file):
+    outFile = open(folder + file, 'r')
+    print(outFile)
+    
+    lines = outFile.readlines()
+    
+    start_hru = int(''.join(filter(str.isdigit, file)))
+    
+    max_lines_to_read = 40
+    lines_read_counter = 1
+    max_items_looking_for = 3
+    items_found = 0
+    row_data = [start_hru, -99, -99, -99]
+
+    for line in reversed(lines):
+        if lines_read_counter > max_lines_to_read:
+            return row_data
+        
+        elif items_found == max_items_looking_for:
+            return row_data
+        
+        elif "FATAL ERROR" in line:
+            return row_data
+        
+        elif "h" in line and "or" in line:
+            hours = re.findall("\d+\.\d+", line)
+            row_data[1] = hours[0]
+            lines_read_counter += 1
+            items_found += 1
+
+        elif "elapsed read" in line:
+            seconds = re.findall("\d+\.\d+", line)
+            row_data[2] = seconds[0]
+            lines_read_counter += 1
+            items_found += 1
+
+        elif "elapsed write" in line:
+            seconds = re.findall("\d+\.\d+", line)
+            row_data[3] = seconds[0]
+            lines_read_counter += 1
+            items_found += 1
+            
+        else:
+            lines_read_counter += 1
+
+output_file = '_log_summaryOriginal.csv'
+ext = ".txt"
+
+# Check command line args
+if len(sys.argv) == 1:
+	sys.exit('Error: no input folder specified')
+else:
+	folder = sys.argv[1]
+
+try:
+	os.remove(folder + "/" + output_file)
+except OSError:
+	pass
+
+files = []
+for file in os.listdir(folder):
+	if file.endswith(ext):
+		files.append(file)
+
+files.sort()
+
+csv_file = open(folder + '/' + output_file, "w")
+writer = csv.writer(csv_file)
+csv_header = ["start_hru", "job_duration", "read_duration", "write_duration"]
+writer.writerow(csv_header)
+
+for file in files:
+	row_data = get_job_stats(folder, file)
+	if row_data is None:
+		start_hru = int(''.join(filter(str.isdigit, file)))
+		row_data = [start_hru, -99, -99, -99]
+
+	writer.writerow(row_data)
+
+csv_file.close()
+
diff --git a/utils/netcdf/OutputVerification/checkOutput.py b/utils/netcdf/OutputVerification/checkOutput.py
index 675ee57..104f83f 100644
--- a/utils/netcdf/OutputVerification/checkOutput.py
+++ b/utils/netcdf/OutputVerification/checkOutput.py
@@ -80,15 +80,15 @@ def get_output_vars(model_output_file):
 
 
 
-num_hru = 1
+num_hru = 125
 print("Checking output for", num_hru, "HRUs")
-dataset_1 = "/home/kklenk/scratch/Kinsol/netcdf/SummaActorsGRU1-1_timestep.nc"
-dataset_2 = "/home/kklenk/scratch/Kinsol/netcdf/SummaActors_kinsolGRU1-1_timestep.nc"
+dataset_1 = "/home/kklenk/scratch/Single_CPU_TEST/actors/netcdf/SummaActorsGRU6126-125_day.nc"
+dataset_2 = "/home/kklenk/scratch/Single_CPU_TEST/non-actors/netcdf/SummaOriginal_G006126-006250_day.nc"
 
 # dataset_1 = "/scratch/kck540/Summa_Sundials/non-actors/SummaOriginal-BE_G000001-000002_timestep.nc"
 # dataset_2 = "/scratch/kck540/Summa_Sundials/actors/SummaActors-BEGRU1-2_timestep.nc"
 
-model_output_file = "/home/kklenk/projects/rpp-kshook/kklenk/settings/SummaActorsSettings/outputControl.txt"
+model_output_file = "/home/kklenk/scratch/Single_CPU_TEST/settings/outputControl.txt"
 
 output_vars = get_output_vars(model_output_file)
 verify_data(dataset_1, dataset_2, num_hru, output_vars)
-- 
GitLab