Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
S
SyntenyLink
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Thulani Hewavithana (qnm481)
SyntenyLink
Commits
e0771fe5
Commit
e0771fe5
authored
1 year ago
by
Thulani Hewavithana (qnm481)
Browse files
Options
Downloads
Patches
Plain Diff
Selection of minimum block length
parent
edb6a7e7
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
Scripts/minimum_block_length_selection.py
+124
-0
124 additions, 0 deletions
Scripts/minimum_block_length_selection.py
with
124 additions
and
0 deletions
Scripts/minimum_block_length_selection.py
0 → 100644
+
124
−
0
View file @
e0771fe5
import
pandas
as
pd
import
matplotlib.pyplot
as
plt
import
numpy
as
np
from
scipy.interpolate
import
make_interp_spline
import
sys
# Function for finding gaps in each column
def
find_gaps
(
col
,
gap_threshold
,
m
):
"""
Finds the gaps in each column of the matrix
col: column of the matrix
gap_threshold: threshold for the gap
m: number of rows in the matrix
returns: lengths of the blocks
"""
# Indices for ones in the given column
indices_one
=
np
.
where
(
col
==
1
)[
0
]
# Calculate difference between consecutive indices
d
=
np
.
diff
(
indices_one
)
# If the difference is bigger than the gap threshold, store the lengths of the blocks
initial_gaps
=
np
.
where
(
d
>
gap_threshold
)[
0
]
block_lengths
=
indices_one
[
initial_gaps
+
1
]
-
indices_one
[
initial_gaps
]
return
block_lengths
# Get the input file as an argument (collinear file)
input_file
=
sys
.
argv
[
sys
.
argv
.
index
(
'
-i
'
)
+
1
]
# Convert the collinear file to a dataframe
C_df_csv
=
pd
.
read_csv
(
input_file
,
sep
=
'
'
,
header
=
None
)
# Make a copy of the dataframe
C_df
=
C_df_csv
.
copy
()
C_df_updated
=
C_df
.
iloc
[
1
:,
1
:
-
3
]
# Convert all the 'x' to 0 and all the other entries which are not equal to 'x' to 1 omitting first column and last three columns and first row
C_df_updated
=
C_df_updated
.
replace
(
r
'
^x$
'
,
0
,
regex
=
True
)
# Convert all the other entries that start with a letter to 1
C_df_updated
=
C_df_updated
.
replace
(
r
'
^[a-zA-Z]
'
,
1
,
regex
=
True
)
# Set first row index starts from 0
C_df_updated
.
index
=
C_df_updated
.
index
-
1
# Convert the dataframe to a numpy array
C
=
C_df_updated
.
to_numpy
()
m
,
n
=
C
.
shape
print
(
m
,
n
)
# Gap_threshold is the threshold to determine if there is a gap or not
# Min_block_length is the minimum length of the signal blocks
gap_threshold
=
sys
.
argv
[
sys
.
argv
.
index
(
'
-g
'
)
+
1
]
#21
# Get all gaps in all columns
block_lengths
=
[
find_gaps
(
C
[:,
i
],
gap_threshold
,
m
)
for
i
in
range
(
n
)]
print
(
block_lengths
)
# Filter block lengths greater than 0
filtered_lengths
=
np
.
concatenate
(
block_lengths
)
filtered_lengths
=
filtered_lengths
[
filtered_lengths
>
50
]
# Calculate frequency of filtered block lengths
unique_lengths
,
length_counts
=
np
.
unique
(
filtered_lengths
,
return_counts
=
True
)
# Find the maximum, mean, and median block lengths and their frequencies
max_length
=
unique_lengths
[
np
.
argmax
(
length_counts
)]
mean_length
=
np
.
mean
(
unique_lengths
)
median_length
=
np
.
median
(
unique_lengths
)
max_frequency
=
np
.
max
(
length_counts
)
mean_frequency
=
np
.
mean
(
length_counts
)
median_frequency
=
np
.
median
(
length_counts
)
print
(
"
Block Length for Maximum Frequency:
"
,
max_length
)
print
(
"
Block Length for Mean Frequency:
"
,
mean_length
)
print
(
"
Block Length for Median Frequency:
"
,
median_length
)
print
(
"
Max Frequency:
"
,
max_frequency
)
print
(
"
Mean Frequency:
"
,
mean_frequency
)
print
(
"
Median Frequency:
"
,
median_frequency
)
# Calculate the maximum frequency block length
max_length
=
unique_lengths
[
np
.
argmax
(
length_counts
)]
max_frequency
=
np
.
max
(
length_counts
)
# Plot frequency of filtered block lengths with smooth curves
fig
,
ax
=
plt
.
subplots
(
figsize
=
(
10
,
6
))
# Find the index of the maximum length
max_length_index
=
np
.
argmax
(
length_counts
)
# Get the values 10 before and after the maximum length index
start_index
=
max
(
0
,
max_length_index
-
10
)
end_index
=
min
(
len
(
unique_lengths
),
max_length_index
+
11
)
# Extend the x-axis range
extended_start
=
max
(
0
,
start_index
-
1
)
extended_end
=
min
(
len
(
unique_lengths
),
end_index
+
1
)
x
=
unique_lengths
[
extended_start
:
extended_end
]
y
=
length_counts
[
extended_start
:
extended_end
]
# Use spline interpolation for smooth curves
x_smooth
=
np
.
linspace
(
x
.
min
(),
x
.
max
(),
300
)
y_smooth
=
make_interp_spline
(
x
,
y
)(
x_smooth
)
ax
.
plot
(
x_smooth
,
y_smooth
,
color
=
'
blue
'
,
linewidth
=
2
)
ax
.
scatter
(
x
,
y
,
color
=
'
red
'
,
marker
=
'
o
'
,
label
=
'
Data Points
'
)
# Add a red dashed line indicating the peak value
ax
.
axvline
(
max_length
,
color
=
'
red
'
,
linestyle
=
'
--
'
)
# Add text annotation for the peak value
ax
.
text
(
max_length
,
max_frequency
,
f
'
Peak:
{
max_length
}
'
,
color
=
'
red
'
,
ha
=
'
center
'
,
va
=
'
bottom
'
)
ax
.
set_xlim
(
x
.
min
(),
x
.
max
())
# Set x-axis limits to the extended range
ax
.
set_xlabel
(
'
Block Length
'
,
fontsize
=
12
)
ax
.
set_ylabel
(
'
Frequency
'
,
fontsize
=
12
)
ax
.
set_title
(
'
Frequency of Block Lengths
'
,
fontsize
=
16
)
ax
.
legend
()
ax
.
grid
(
True
,
linestyle
=
'
--
'
,
linewidth
=
0.5
,
alpha
=
0.7
)
plt
.
show
()
# Save plot as figure
fig
.
savefig
(
'
block_length_frequency.png
'
,
dpi
=
300
,
bbox_inches
=
'
tight
'
)
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment