Selection of minimum block length

e0771fe5 · Thulani Hewavithana (qnm481) · edb6a7e7 · e0771fe5
Commit e0771fe5 authored 1 year ago by Thulani Hewavithana (qnm481)
--- a/Scripts/minimum_block_length_selection.py
+++ b/Scripts/minimum_block_length_selection.py
+import pandas as pd
+import matplotlib.pyplot as plt
+import numpy as np
+from scipy.interpolate import make_interp_spline
+import sys
+
+# Function for finding gaps in each column
+def find_gaps(col, gap_threshold, m):
+    """
+    Finds the gaps in each column of the matrix
+    col: column of the matrix
+    gap_threshold: threshold for the gap
+    m: number of rows in the matrix
+
+    returns: lengths of the blocks
+    """
+    # Indices for ones in the given column
+    indices_one = np.where(col == 1)[0]
+    # Calculate difference between consecutive indices
+    d = np.diff(indices_one)
+
+    # If the difference is bigger than the gap threshold, store the lengths of the blocks
+    initial_gaps = np.where(d > gap_threshold)[0]
+    block_lengths = indices_one[initial_gaps + 1] - indices_one[initial_gaps]
+
+    return block_lengths
+
+# Get the input file as an argument (collinear file)
+input_file = sys.argv[sys.argv.index('-i') + 1]
+
+# Convert the collinear file to a dataframe
+C_df_csv = pd.read_csv(input_file, sep=' ', header=None)
+# Make a copy of the dataframe
+C_df = C_df_csv.copy()
+C_df_updated = C_df.iloc[1:, 1:-3]
+# Convert all the 'x' to 0 and all the other entries which are not equal to 'x' to 1 omitting first column and last three columns and first row
+C_df_updated = C_df_updated.replace(r'^x$', 0, regex=True)
+# Convert all the other entries that start with a letter to 1
+C_df_updated = C_df_updated.replace(r'^[a-zA-Z]', 1, regex=True)
+# Set first row index starts from 0
+C_df_updated.index = C_df_updated.index - 1
+
+# Convert the dataframe to a numpy array
+C = C_df_updated.to_numpy()
+m, n = C.shape
+print(m, n)
+
+# Gap_threshold is the threshold to determine if there is a gap or not
+# Min_block_length is the minimum length of the signal blocks
+gap_threshold = sys.argv[sys.argv.index('-g') + 1]
+#21
+
+# Get all gaps in all columns
+block_lengths = [find_gaps(C[:, i], gap_threshold, m) for i in range(n)]
+print(block_lengths)
+
+# Filter block lengths greater than 0
+filtered_lengths = np.concatenate(block_lengths)
+filtered_lengths = filtered_lengths[filtered_lengths > 50]
+
+# Calculate frequency of filtered block lengths
+unique_lengths, length_counts = np.unique(filtered_lengths, return_counts=True)
+
+# Find the maximum, mean, and median block lengths and their frequencies
+max_length = unique_lengths[np.argmax(length_counts)]
+mean_length = np.mean(unique_lengths)
+median_length = np.median(unique_lengths)
+
+max_frequency = np.max(length_counts)
+mean_frequency = np.mean(length_counts)
+median_frequency = np.median(length_counts)
+
+print("Block Length for Maximum Frequency:", max_length)
+print("Block Length for Mean Frequency:", mean_length)
+print("Block Length for Median Frequency:", median_length)
+print("Max Frequency:", max_frequency)
+print("Mean Frequency:", mean_frequency)
+print("Median Frequency:", median_frequency)
+
+# Calculate the maximum frequency block length
+max_length = unique_lengths[np.argmax(length_counts)]
+max_frequency = np.max(length_counts)
+
+# Plot frequency of filtered block lengths with smooth curves
+fig, ax = plt.subplots(figsize=(10, 6))
+
+# Find the index of the maximum length
+max_length_index = np.argmax(length_counts)
+
+# Get the values 10 before and after the maximum length index
+start_index = max(0, max_length_index - 10)
+end_index = min(len(unique_lengths), max_length_index + 11)
+
+# Extend the x-axis range
+extended_start = max(0, start_index - 1)
+extended_end = min(len(unique_lengths), end_index + 1)
+
+x = unique_lengths[extended_start:extended_end]
+y = length_counts[extended_start:extended_end]
+
+# Use spline interpolation for smooth curves
+x_smooth = np.linspace(x.min(), x.max(), 300)
+y_smooth = make_interp_spline(x, y)(x_smooth)
+
+ax.plot(x_smooth, y_smooth, color='blue', linewidth=2)
+ax.scatter(x, y, color='red', marker='o', label='Data Points')
+
+# Add a red dashed line indicating the peak value
+ax.axvline(max_length, color='red', linestyle='--')
+
+# Add text annotation for the peak value
+ax.text(max_length, max_frequency, f'Peak: {max_length}', color='red', ha='center', va='bottom')
+
+ax.set_xlim(x.min(), x.max())  # Set x-axis limits to the extended range
+ax.set_xlabel('Block Length', fontsize=12)
+ax.set_ylabel('Frequency', fontsize=12)
+ax.set_title('Frequency of Block Lengths', fontsize=16)
+ax.legend()
+ax.grid(True, linestyle='--', linewidth=0.5, alpha=0.7)
+
+plt.show()
+
+# Save plot as figure
+fig.savefig('block_length_frequency.png', dpi=300, bbox_inches='tight')