Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
S
SyntenyLink
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Thulani Hewavithana (qnm481)
SyntenyLink
Commits
c5c0de98
Commit
c5c0de98
authored
1 year ago
by
Thulani Hewavithana (qnm481)
Browse files
Options
Downloads
Patches
Plain Diff
Main file
parent
931bee9a
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
Scripts/main_script.py
+223
-0
223 additions, 0 deletions
Scripts/main_script.py
with
223 additions
and
0 deletions
Scripts/main_script.py
0 → 100644
+
223
−
0
View file @
c5c0de98
import
numpy
as
np
import
pandas
as
pd
import
re
import
warnings
import
sys
import
pickle
import
csv
import
os
import
SyntenyLink_mbp
as
mbp
import
SyntenyLink_wg
as
wg
import
SyntenyLink_sb
as
sb
import
SyntenyLink_mn
as
mn
import
SyntenyLink_acc
as
acc
def
main
():
#get the input file as an argument (collinear file)
input_file
=
sys
.
argv
[
sys
.
argv
.
index
(
'
-i
'
)
+
1
]
#take gap_threshold and min_block_length which are numbers as arguments
gap_threshold
=
int
(
sys
.
argv
[
sys
.
argv
.
index
(
'
-g
'
)
+
1
])
min_block_length
=
int
(
sys
.
argv
[
sys
.
argv
.
index
(
'
-m
'
)
+
1
])
#Get the number of subgenomes as an argument
n_subgenomes
=
int
(
sys
.
argv
[
sys
.
argv
.
index
(
'
-n
'
)
+
1
])
#convert the collinear file to a dataframe
C_df_csv
=
pd
.
read_csv
(
input_file
,
sep
=
'
\t
'
,
header
=
None
)
#make a copy of the dataframe
C_df
=
C_df_csv
.
copy
()
C_df_updated
=
C_df
.
iloc
[
1
:,
1
:
-
3
]
#convert all the 'x' to 0 and all the other entries which are not equal to 'x' to 1 omitting first column and last three columns and first row
C_df_updated
=
C_df_updated
.
replace
(
r
'
^x$
'
,
0
,
regex
=
True
)
#convert all the other entries starts with a letter to 1
C_df_updated
=
C_df_updated
.
replace
(
r
'
^[a-zA-Z]
'
,
1
,
regex
=
True
)
#set first row index starts from 0
C_df_updated
.
index
=
C_df_updated
.
index
-
1
C_df_updated_copy
=
C_df
.
iloc
[
1
:,
1
:
-
3
]
C_df_updated_copy
.
columns
=
C_df
.
iloc
[
0
,
1
:
-
3
]
C_df_updated_copy
.
index
=
C_df_updated_copy
.
index
-
1
#convert the dataframe to a numpy array
C
=
C_df_updated
.
to_numpy
()
m
,
n
=
C
.
shape
print
(
m
,
n
)
break_point_indices
=
mbp
.
gap_calculation
(
C
,
gap_threshold
,
min_block_length
,
n
,
m
)
densities
=
mbp
.
get_densities
(
C
,
break_point_indices
,
n
,
m
)
mbp
.
create_excel_sheet
(
C_df
,
break_point_indices
,
densities
,
C_df_updated_copy
)
df_temp
=
mbp
.
get_subgenomes
(
"
Super_synteny_block_output.xlsx
"
,
n_subgenomes
)
df
=
mbp
.
assign_subgenomes
(
df_temp
,
"
Super_synteny_block_output.xlsx
"
,
n_subgenomes
)
#Saving the file
df
.
to_excel
(
"
Super_synteny_bl_sub_placement_density.xlsx
"
)
# Get the column names from df_subgenome_density that start with N followed by a number
column_names
=
df
.
filter
(
regex
=
r
'
^N\d+
'
).
columns
.
tolist
()
# Replace the columns starting from the second column until the end of C_df_updated with the selected column names
df_synteny
=
C_df_csv
.
iloc
[
1
:,
1
:
-
3
].
rename
(
columns
=
{
C_df_csv
.
iloc
[
1
:,
1
:
-
3
].
columns
[
i
-
1
]:
column_names
[
i
-
1
]
for
i
in
range
(
1
,
len
(
column_names
)
+
1
)})
#append the first column of C_df to the first column of df_synteny
df_synteny
.
insert
(
0
,
"
locus_id
"
,
C_df_csv
.
iloc
[
1
:,
0
])
#update first row index starting from 0
df_synteny
.
index
=
df_synteny
.
index
-
1
#Input the file for chains generated in DagChainer as a command line argument
chains_file
=
sys
.
argv
[
sys
.
argv
.
index
(
'
-c
'
)
+
1
]
#Input the file for blastn generated in DagChainer as a command line argument
blastn_file
=
sys
.
argv
[
sys
.
argv
.
index
(
'
-bl
'
)
+
1
]
num_blocks_main
=
len
(
pd
.
read_excel
(
"
Super_synteny_bl_sub_placement_density.xlsx
"
))
wg
.
node_traverse_most_weighted_path
(
n_subgenomes
,
df_synteny
,
chains_file
,
blastn_file
,
C_df_csv
,
num_blocks_main
)
print
(
"
===========================================
"
)
print
(
f
"
Number of blocks:
{
num_blocks_main
}
"
)
print
(
"
===========================================
"
)
num_blocks
=
len
(
pd
.
read_excel
(
"
Super_synteny_bl_sub_placement_density.xlsx
"
))
# Determine file names based on number of subgenomes
subgenome_density_files
=
"
Super_synteny_bl_sub_placement_density.xlsx
"
gene_matrix_output_files
=
"
Super_synteny_block_output.xlsx
"
# Define the small_BP_subi list to hold small breakpoints for each subgenome
small_BP_sub
=
[]
for
i
in
range
(
n_subgenomes
):
# Generate small breakpoints
small_BP_subi
=
sb
.
remove_invalid_entries
(
sb
.
modify_small_breakpoints
(
sb
.
merge_main_small_breakpoints
(
sb
.
update_small_breakpoints
(
sb
.
find_small_breakpoints
(
sb
.
sort_gaps
(
sb
.
merge_gaps
(
sb
.
merge_gaps
(
sb
.
get_gaps
(
gene_matrix_output_files
,
subgenome_density_files
,
C
,
n_subgenomes
,
1
)
)
)
),
subgenome_density_files
)
),
subgenome_density_files
)
)
)
small_BP_sub
.
append
(
small_BP_subi
)
sb
.
update_df_synteny
(
C_df
,
C_df_updated
,
subgenome_density_files
,
f
"
abc_synteny_chromosome_names.success.colinear
{
i
+
1
}
.xlsx
"
)
for
k
in
range
(
n_subgenomes
):
sb
.
subgenome_assignment_all_BP
(
f
"
Super_synteny_graph_nodes_sub
{
k
+
1
}
.xlsx
"
,
f
"
abc_synteny_chromosome_names.success.colinear
{
k
+
1
}
.xlsx
"
,
small_BP_sub
[
k
],
f
"
subgenome_placement_blocks.all.sub
{
k
+
1
}
.xlsx
"
)
print
(
"
=========================================
"
)
print
(
"
Number of small + main blocks:
"
,
len
(
pd
.
read_excel
(
"
subgenome_placement_blocks.all.sub1.xlsx
"
)))
print
(
"
=========================================
"
)
final_gaps_files
=
[
"
subgenome_placement_blocks.all.sub1.xlsx
"
,
"
subgenome_placement_blocks.all.sub2.xlsx
"
,
"
subgenome_placement_blocks.all.sub3.xlsx
"
]
sb
.
extract_subgenome_columns
(
final_gaps_files
,
n_subgenomes
)
# Read the data
df_subgenome_density
=
pd
.
read_excel
(
"
subgenome_placement_blocks.all.xlsx
"
)
# Create a list of subgenomes
subgenomes
=
[[]
for
_
in
range
(
n_subgenomes
)]
for
i
in
range
(
len
(
df_subgenome_density
[
"
Row start #
"
])):
for
j
in
range
(
df_subgenome_density
[
"
Row start #
"
].
values
[
i
],
df_subgenome_density
[
"
Row end #
"
].
values
[
i
]
+
1
):
for
k
in
range
(
n_subgenomes
):
subgenomes
[
k
].
append
(
df_synteny
[
df_subgenome_density
[
f
"
subgenome
{
k
+
1
}
"
].
values
[
i
]][
j
])
# Create a dataframe of subgenomes
df_subgenome
=
pd
.
DataFrame
(
subgenomes
).
transpose
()
# Change the column names
df_subgenome
.
columns
=
[
f
"
subgenome
{
i
+
1
}
"
for
i
in
range
(
n_subgenomes
)]
df_subgenome_sub1
=
pd
.
read_excel
(
"
subgenome_placement_blocks.all.sub1.xlsx
"
)
df_subgenome_sub2
=
pd
.
read_excel
(
"
subgenome_placement_blocks.all.sub2.xlsx
"
)
df_subgenome_sub3
=
pd
.
read_excel
(
"
subgenome_placement_blocks.all.sub3.xlsx
"
)
output_final_df
=
sb
.
check_subgenome
(
df_subgenome_density
,
df_subgenome_sub1
,
df_subgenome_sub2
,
df_subgenome_sub3
)
sb
.
error_check
(
output_final_df
,
df_subgenome
,
df_subgenome_sub1
,
df_subgenome_sub2
,
df_subgenome_sub3
)
window_size_sub1
=
int
(
sys
.
argv
[
sys
.
argv
.
index
(
'
-ws1
'
)
+
1
])
window_size_sub2
=
int
(
sys
.
argv
[
sys
.
argv
.
index
(
'
-ws2
'
)
+
1
])
window_size_sub3
=
int
(
sys
.
argv
[
sys
.
argv
.
index
(
'
-ws3
'
)
+
1
])
df_subgenome
=
sb
.
check_neighbourhoods
(
output_final_df
,
window_size_sub1
,
window_size_sub2
,
window_size_sub3
)
df_subgenome
.
to_excel
(
"
subgenome_placement_blocks.all.xlsx
"
)
# Read the specified range of cells into a pandas DataFrame
df
=
pd
.
read_excel
(
"
subgenome_placement_blocks.all.xlsx
"
,
header
=
None
,
usecols
=
"
E:G
"
,
skiprows
=
1
)
# Convert the DataFrame to a NumPy array
subgenomes
=
np
.
array
(
df
)
len_subgenomes
=
len
(
subgenomes
[:,
0
])
print
(
len_subgenomes
)
window_up
=
window_size_sub1
window_down
=
window_size_sub2
arg_max_count_2
=
np
.
array
([
-
1
,
-
1
,
-
1
])
for
k
in
range
(
len_subgenomes
):
if
k
<
window_up
:
start_ind_2
=
0
end_ind_2
=
k
+
window_down
elif
k
<
len_subgenomes
-
window_down
:
start_ind_2
=
k
-
window_up
end_ind_2
=
k
+
window_down
else
:
start_ind_2
=
k
-
window_up
end_ind
=
len_subgenomes
-
1
for
i
in
range
(
3
):
#find the column that has the maximum count for all_head[i]
count1_2
=
mn
.
count_occurrences
(
subgenomes
[
start_ind_2
:
end_ind_2
,
0
],
subgenomes
[
k
,
i
])
count2_2
=
mn
.
count_occurrences
(
subgenomes
[
start_ind_2
:
end_ind_2
,
1
],
subgenomes
[
k
,
i
])
count3_2
=
mn
.
count_occurrences
(
subgenomes
[
start_ind_2
:
end_ind_2
,
2
],
subgenomes
[
k
,
i
])
counts_2
=
np
.
array
([
count1_2
,
count2_2
,
count3_2
])
arg_max_count_2
[
i
]
=
np
.
argmax
(
counts_2
)
arg_max_count_2
=
np
.
array
(
arg_max_count_2
)
if
not
(
np
.
array_equal
(
arg_max_count_2
,
np
.
array
([
0
,
1
,
2
]))):
#if the entries do not belong to their higher dense columns
for
i
in
range
(
3
):
elements_2
=
np
.
where
(
arg_max_count_2
==
i
)
if
len
(
elements_2
[
0
])
==
1
:
temp
=
subgenomes
[
k
,
elements_2
[
0
][
0
]]
subgenomes
[
k
,
i
]
=
subgenomes
[
k
,
elements_2
[
0
][
0
]]
subgenomes
[
k
,
elements_2
[
0
][
0
]]
=
temp
elif
len
(
elements_2
[
0
])
>
1
:
#elements that are competing for column i, that is, there is a tie
for
j
in
range
(
len
(
elements_2
[
0
])):
#loop over the elements that are competing for column i
if
k
==
0
:
previous_2
=
1
else
:
previous_2
=
k
-
1
if
mn
.
compare_subgenomes
(
subgenomes
[
k
-
1
,
i
],
subgenomes
[
k
,
elements_2
[
0
][
j
]])
and
subgenomes
[
k
,
i
]
!=
subgenomes
[
k
,
elements_2
[
0
][
j
]]:
#if the next entry in the column i is equal to entry j
# and that entry is already not in column i
# then swap them
temp
=
subgenomes
[
k
,
i
]
subgenomes
[
k
,
i
]
=
subgenomes
[
k
,
elements_2
[
0
][
j
]]
subgenomes
[
k
,
elements_2
[
0
][
j
]]
=
temp
cols
=
pd
.
read_excel
(
"
subgenome_placement_blocks.all.xlsx
"
,
header
=
None
,
usecols
=
"
C:D
"
,
skiprows
=
1
)
df_new_2
=
pd
.
DataFrame
()
df_new_2
[
'
Row start #
'
]
=
cols
[
2
]
df_new_2
[
'
Row end #
'
]
=
cols
[
3
]
df_new_2
[
'
subgenome1
'
]
=
subgenomes
[:,
0
]
df_new_2
[
'
subgenome2
'
]
=
subgenomes
[:,
1
]
df_new_2
[
'
subgenome3
'
]
=
subgenomes
[:,
2
]
df_new_2
.
to_excel
(
"
Final_subgenome_placement_result.xlsx
"
)
#If there exist a ground truth file, then compare the results with the ground truth
# GT = sys.argv[sys.argv.index('-gt') + 1]
# acc.subgenome_overlap(GT,"Final_subgenome_placement_result.xlsx", df_synteny, 3)
if
__name__
==
'
__main__
'
:
main
()
\ No newline at end of file
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment