Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • cpv616/module1_2
  • nma904/module1_2
2 results
Show changes
Commits on Source (2)
...@@ -3,7 +3,7 @@ input_file: ...@@ -3,7 +3,7 @@ input_file:
leaf_genome_info: "/test2.txt" leaf_genome_info: "/test2.txt"
synteny_file_name: ".CDS-CDS.last.tdd10.cs0.filtered.dag.all.go_D20_g10_A5.aligncoords.gcoords.ks.txt" synteny_file_name: ".CDS-CDS.last.tdd10.cs0.filtered.dag.all.go_D20_g10_A5.aligncoords.gcoords.ks.txt"
synteny_file_path: "/" synteny_file_path: "/"
jar_path: "./" jar_path: "."
# output : paths to output files # output : paths to output files
output_file: output_file:
......
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# RACCROCHE - Module 1 & 2 # RACCROCHE - Module 1 & 2
This is a full workflow that shows methods regarding the construction of gene families, listing of candidate adjacencies and construction of ancestral contigs using Maximum Weight Matching. This is a full workflow that shows methods regarding the construction of gene families, listing of candidate adjacencies and construction of ancestral contigs using Maximum Weight Matching.
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# Section 1: Importing Libraries, Modules and Configuration File # Section 1: Importing Libraries, Modules and Configuration File
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# setting the Notebook Display Method # setting the Notebook Display Method
%matplotlib inline %matplotlib inline
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# install tqdm-joblib # install tqdm-joblib
%pip install tqdm-joblib %pip install tqdm-joblib
# install Bio # install Bio
%pip install Bio %pip install Bio
``` ```
%% Output %% Output
Defaulting to user installation because normal site-packages is not writeable Defaulting to user installation because normal site-packages is not writeable
Requirement already satisfied: tqdm-joblib in /student/nma904/.local/lib/python3.8/site-packages (0.0.2) Requirement already satisfied: tqdm-joblib in /student/nma904/.local/lib/python3.8/site-packages (0.0.2)
Requirement already satisfied: tqdm in /usr/local/anaconda3/lib/python3.8/site-packages (from tqdm-joblib) (4.50.2) Requirement already satisfied: tqdm in /usr/local/anaconda3/lib/python3.8/site-packages (from tqdm-joblib) (4.50.2)
Note: you may need to restart the kernel to use updated packages. Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable Defaulting to user installation because normal site-packages is not writeable
Requirement already satisfied: Bio in /student/nma904/.local/lib/python3.8/site-packages (1.4.0) Requirement already satisfied: Bio in /student/nma904/.local/lib/python3.8/site-packages (1.4.0)
Requirement already satisfied: requests in /usr/local/anaconda3/lib/python3.8/site-packages (from Bio) (2.24.0) Requirement already satisfied: requests in /usr/local/anaconda3/lib/python3.8/site-packages (from Bio) (2.24.0)
Requirement already satisfied: biopython>=1.79 in /usr/local/anaconda3/lib/python3.8/site-packages (from Bio) (1.79) Requirement already satisfied: biopython>=1.79 in /usr/local/anaconda3/lib/python3.8/site-packages (from Bio) (1.79)
Requirement already satisfied: mygene in /student/nma904/.local/lib/python3.8/site-packages (from Bio) (3.2.2) Requirement already satisfied: mygene in /student/nma904/.local/lib/python3.8/site-packages (from Bio) (3.2.2)
Requirement already satisfied: tqdm in /usr/local/anaconda3/lib/python3.8/site-packages (from Bio) (4.50.2) Requirement already satisfied: tqdm in /usr/local/anaconda3/lib/python3.8/site-packages (from Bio) (4.50.2)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/anaconda3/lib/python3.8/site-packages (from requests->Bio) (2022.5.18.1) Requirement already satisfied: certifi>=2017.4.17 in /usr/local/anaconda3/lib/python3.8/site-packages (from requests->Bio) (2022.5.18.1)
Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/anaconda3/lib/python3.8/site-packages (from requests->Bio) (3.0.4) Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/anaconda3/lib/python3.8/site-packages (from requests->Bio) (3.0.4)
Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/anaconda3/lib/python3.8/site-packages (from requests->Bio) (1.25.11) Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/anaconda3/lib/python3.8/site-packages (from requests->Bio) (1.25.11)
Requirement already satisfied: idna<3,>=2.5 in /usr/local/anaconda3/lib/python3.8/site-packages (from requests->Bio) (2.10) Requirement already satisfied: idna<3,>=2.5 in /usr/local/anaconda3/lib/python3.8/site-packages (from requests->Bio) (2.10)
Requirement already satisfied: numpy in /usr/local/anaconda3/lib/python3.8/site-packages (from biopython>=1.79->Bio) (1.23.1) Requirement already satisfied: numpy in /usr/local/anaconda3/lib/python3.8/site-packages (from biopython>=1.79->Bio) (1.23.1)
Requirement already satisfied: biothings-client>=0.2.6 in /student/nma904/.local/lib/python3.8/site-packages (from mygene->Bio) (0.2.6) Requirement already satisfied: biothings-client>=0.2.6 in /student/nma904/.local/lib/python3.8/site-packages (from mygene->Bio) (0.2.6)
Note: you may need to restart the kernel to use updated packages. Note: you may need to restart the kernel to use updated packages.
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# import libraries # import libraries
import sys import sys
import yaml import yaml
import time import time
import os import os
import io import io
import pandas as pd import pandas as pd
import seaborn as sns import seaborn as sns
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from Bio import Phylo from Bio import Phylo
from tqdm import tqdm from tqdm import tqdm
import tqdm_joblib import tqdm_joblib
from joblib import Parallel, delayed from joblib import Parallel, delayed
# import modules # import modules
#? eventually the below list will be in your package #? eventually the below list will be in your package
from GeneFamily import GeneFamily from GeneFamily import GeneFamily
from Genome import Genome from Genome import Genome
from MWMInputTreeNode import MWMInputTreeNode from MWMInputTreeNode import MWMInputTreeNode
from mwmatching import * from mwmatching import *
from Contig import * from Contig import *
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Parsing the Configuration File ## Parsing the Configuration File
The configuration file consists of a list of input/output directories, input files and parameters to the pipeline. The configuration file consists of a list of input/output directories, input files and parameters to the pipeline.
Specifications for Input: Specifications for Input:
- leaf_genome_info: the input genomes with their positions on the phylogenetic tree, i.e. *Genomes.txt* <br> - leaf_genome_info: the input genomes with their positions on the phylogenetic tree, i.e. *Genomes.txt* <br>
- synteny_file_name: the post-fix of the synmap output of pairwise genome comparisons; <br> - synteny_file_name: the post-fix of the synmap output of pairwise genome comparisons; <br>
- synteny_file_path: the path to the synmap output data <br> - synteny_file_path: the path to the synmap output data <br>
- jar_path: the path to the UniMoG jar file <br> - jar_path: the path to the UniMoG jar file <br>
__Note__: The input genomes and phylogenetic relationship is in the *Genomes.txt* file. __Note__: The input genomes and phylogenetic relationship is in the *Genomes.txt* file.
It includes the unrooted phylogenetic tree in newick format along with genome structure and corresponding input files.<br> It includes the unrooted phylogenetic tree in newick format along with genome structure and corresponding input files.<br>
Sample Newick Tree for the monocot projet: (((51364, 54711), 51051), ((25734, 33018), 33908)) Sample Newick Tree for the monocot projet: (((51364, 54711), 51051), ((25734, 33018), 33908))
Specifications for Output: Specifications for Output:
- gene_list: path where the gene list output should be created <br> - gene_list: path where the gene list output should be created <br>
- gene_family: path where the gene family output should be created <br> - gene_family: path where the gene family output should be created <br>
- genomes: path where the genome string output should be created <br> - genomes: path where the genome string output should be created <br>
- mwm_input_template: path and template for the output files created from MWM Input step <br> - mwm_input_template: path and template for the output files created from MWM Input step <br>
- mwm_output_template: path and template for the output files created from MWM <br> - mwm_output_template: path and template for the output files created from MWM <br>
- contig_template: path and template for the output files created from constructing contigs <br> - contig_template: path and template for the output files created from constructing contigs <br>
- dcj_output_path: path to the output file when calculating DCJ distance between ancestors <br> - dcj_output_path: path to the output file when calculating DCJ distance between ancestors <br>
- dcj_summary_path: path to the output file containing a summary of the DCJ calculations <br><br> - dcj_summary_path: path to the output file containing a summary of the DCJ calculations <br><br>
Global parameters: Global parameters:
- min_cutoff_weight: minimum similarity cutoff weight for gene family construction - min_cutoff_weight: minimum similarity cutoff weight for gene family construction
- max_cutoff_weight: maximum similarity cutoff weight for gene family construction - max_cutoff_weight: maximum similarity cutoff weight for gene family construction
- ws: window size - ws: window size
- gn: number of leaf genomes - gn: number of leaf genomes
- gf1: maximum number of genes in a gene family - gf1: maximum number of genes in a gene family
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# reading from the configuration file # reading from the configuration file
config_file="../inputData/project-buxus/config.yaml" config_file="../inputData/project-monocots/config.yaml"
with open(config_file, 'r') as stream: with open(config_file, 'r') as stream:
parsed_config = yaml.load(stream, Loader=yaml.FullLoader) parsed_config = yaml.load(stream, Loader=yaml.FullLoader)
directory = parsed_config['output_path'] + parsed_config['project_name'] directory = parsed_config['output_path'] + parsed_config['project_name']
os.makedirs(directory, exist_ok = True) os.makedirs(directory, exist_ok = True)
print("Project Name and Input Directory:", parsed_config['input_path'] + parsed_config['project_name']) print("Project Name and Input Directory:", parsed_config['input_path'] + parsed_config['project_name'])
print("Project Name and Output Directory:", parsed_config['output_path'] + parsed_config['project_name']) print("Project Name and Output Directory:", parsed_config['output_path'] + parsed_config['project_name'])
print('''Please check required input data: print('''Please check required input data:
\t 1. Genome.txt with info about input genomes and phylogenetic tree \t 1. Genome.txt with info about input genomes and phylogenetic tree
\t 2. SynMap results between every pair of genomes \t 2. SynMap results between every pair of genomes
''') ''')
#print("Project input data directory:", parsed_config['input_file']['synteny_file_path']) #print("Project input data directory:", parsed_config['input_file']['synteny_file_path'])
#print("Input extant genomes info:", parsed_config['input_file']['leaf_genome_info']) #print("Input extant genomes info:", parsed_config['input_file']['leaf_genome_info'])
#print("The postfix of SynMap output files:", parsed_config['input_file']['synteny_file_name']) #print("The postfix of SynMap output files:", parsed_config['input_file']['synteny_file_name'])
``` ```
%% Output %% Output
Project Name and Input Directory: ../inputData/project-buxus Project Name and Input Directory: ../inputData/project-monocots
Project Name and Output Directory: ../outputData/project-buxus Project Name and Output Directory: ../outputData/project-monocots
Please check required input data: Please check required input data:
1. Genome.txt with info about input genomes and phylogenetic tree 1. Genome.txt with info about input genomes and phylogenetic tree
2. SynMap results between every pair of genomes 2. SynMap results between every pair of genomes
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# Section 2: Constructing Gene Families from syntenically validated orthologs from SynMap # Section 2: Constructing Gene Families from syntenically validated orthologs from SynMap
In order to succesfully construct gene families, we require: In order to succesfully construct gene families, we require:
- Successful parsing of the configuration file <br> - Successful parsing of the configuration file <br>
- Valid parameters in the configuration file - Valid parameters in the configuration file
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Reading in Input Genome Data and Tree Structure Data ## Reading in Input Genome Data and Tree Structure Data
Each genome is a leaf of the input phylogenetic tree. The *get_leaves_and_tree_info()* method extracts the following attribute of each genome from the input file specified by *leaf_genome_info* parameter in the configuration file. Each genome is a leaf of the input phylogenetic tree. The *get_leaves_and_tree_info()* method extracts the following attribute of each genome from the input file specified by *leaf_genome_info* parameter in the configuration file.
>1) genome ID; <br> >1) genome ID; <br>
>2) genome name; <br> >2) genome name; <br>
>3) most recent ancestor; <br> >3) most recent ancestor; <br>
>4) number of chromosomes; <br> >4) number of chromosomes; <br>
>5) file name of the genome annotations; <br> >5) file name of the genome annotations; <br>
>6) desired tree structure in Newick format. >6) desired tree structure in Newick format.
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# reading in input genome info # reading in input genome info
gene_family = GeneFamily(parsed_config) gene_family = GeneFamily(parsed_config)
all_leaves, median_structure, newick_structure = gene_family.get_leaves_and_tree_info() all_leaves, median_structure, newick_structure = gene_family.get_leaves_and_tree_info()
print("Extant genomes to be analyzed in this project: \n") print("Extant genomes to be analyzed in this project: \n")
genomes = pd.DataFrame(all_leaves) genomes = pd.DataFrame(all_leaves)
display = genomes.iloc[:, :3] display = genomes.iloc[:, :3]
display.columns = ['Genome ID', 'Genome Name', 'Ancestor'] display.columns = ['Genome ID', 'Genome Name', 'Ancestor']
display.set_index('Genome ID', inplace = True, drop = True) display.set_index('Genome ID', inplace = True, drop = True)
display display
``` ```
%% Output %% Output
[['28620', '35405', '57266,57268,54057,19990'], ['57266', '57268', '28620,35405,54057,19990'], ['28620,35405', '57266,57268', '54057,19990'], ['19990', '54057', '28620,35405,57266,57268']] [['28620', '35405', '57266,57268,54057,19990'], ['57266', '57268', '28620,35405,54057,19990'], ['28620,35405', '57266,57268', '54057,19990'], ['19990', '54057', '28620,35405,57266,57268']]
Extant genomes to be analyzed in this project: Extant genomes to be analyzed in this project:
Genome Name Ancestor Genome Name Ancestor
Genome ID Genome ID
19990 Vitis 3 19990 Vitis 3
28620 Aquilegea 1 28620 Aquilegea 1
35405 Nelumbo 1 35405 Nelumbo 1
54057 Amaranthus 3 54057 Amaranthus 3
57266 Tetracentron 2 57266 Tetracentron 2
57268 Buxus 2 57268 Buxus 2
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
tree = Phylo.read(io.StringIO(newick_structure), "newick") tree = Phylo.read(io.StringIO(newick_structure), "newick")
Phylo.draw(tree) Phylo.draw(tree)
``` ```
%% Output %% Output
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Creating gene families from syntenically validated homology generated from SynMap ## Creating gene families from syntenically validated homology generated from SynMap
Read in all genes for every genome, record gene-related information and use gf1 (from config file), cutoff weights in order to create gene families. Read in all genes for every genome, record gene-related information and use gf1 (from config file), cutoff weights in order to create gene families.
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
print(all_leaves) print(all_leaves)
# creating gene families # creating gene families
gene_family.make_gene_family(all_leaves) gene_family.make_gene_family(all_leaves)
print(len(all_leaves)) print(len(all_leaves))
``` ```
%% Output %% Output
[['19990', 'Vitis', '3', '19'], ['28620', 'Aquilegea', '1', '7'], ['35405', 'Nelumbo', '1', '8'], ['54057', 'Amaranthus', '3', '16'], ['57266', 'Tetracentron', '2', '19'], ['57268', 'Buxus', '2', '14']] [['19990', 'Vitis', '3', '19'], ['28620', 'Aquilegea', '1', '7'], ['35405', 'Nelumbo', '1', '8'], ['54057', 'Amaranthus', '3', '16'], ['57266', 'Tetracentron', '2', '19'], ['57268', 'Buxus', '2', '14']]
6 6
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Visualizing Gene Families ## Visualizing Gene Families
In the configuration file *config.yaml*, there are three parameters to restrict the size of gene families: In the configuration file *config.yaml*, there are three parameters to restrict the size of gene families:
- gf1: the maximum number of genes in a gene family - gf1: the maximum number of genes in a gene family
- gf2: the maximum number of genes from a single genome in a gene family - gf2: the maximum number of genes from a single genome in a gene family
- gf3: the minimum number of genomes in a gene family <br> - gf3: the minimum number of genomes in a gene family <br>
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
families = list(gene_family.gene_family.keys()) families = list(gene_family.gene_family.keys())
values = list(gene_family.gene_family.values()) values = list(gene_family.gene_family.values())
lengths = [len(values[i]) for i in range(0, len(values))] lengths = [len(values[i]) for i in range(0, len(values))]
print("Total number of gene familes:",len(families)) print("Total number of gene familes:",len(families))
print("The size of the largest gene family:", max(lengths)) print("The size of the largest gene family:", max(lengths))
``` ```
%% Output %% Output
Total number of gene familes: 13649 Total number of gene familes: 13649
The size of the largest gene family: 49 The size of the largest gene family: 49
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
sns.set(style="darkgrid") sns.set(style="darkgrid")
family_dict = {'label': families, 'lengths': lengths} family_dict = {'label': families, 'lengths': lengths}
pdfamily = pd.DataFrame(family_dict) pdfamily = pd.DataFrame(family_dict)
fig=sns.histplot(data = pdfamily, x = "lengths", binwidth = 3).set(title = "Overview of Gene Families") fig=sns.histplot(data = pdfamily, x = "lengths", binwidth = 3).set(title = "Overview of Gene Families")
plt.xlabel("Number of genes in a gene family") plt.xlabel("Number of genes in a gene family")
plt.ylabel("Frequency") plt.ylabel("Frequency")
plt.show(fig) plt.show(fig)
``` ```
%% Output %% Output
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# Section 3: Representing Genomes by Gene Family Labels # Section 3: Representing Genomes by Gene Family Labels
In order to succesfully separate the input data (genes) into respective chromosomes and genomes, we require: In order to succesfully separate the input data (genes) into respective chromosomes and genomes, we require:
- Successful creation of gene families - Successful creation of gene families
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Grouping Genes and their ordering by Chromosomes then Chromosomes by Genomes ## Grouping Genes and their ordering by Chromosomes then Chromosomes by Genomes
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# creating an instance of the Genome class # creating an instance of the Genome class
initialize_genome = Genome(gene_family.gene_list, all_leaves, parsed_config) initialize_genome = Genome(gene_family.gene_list, all_leaves, parsed_config)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# separating each genome's genes into chromosomes # separating each genome's genes into chromosomes
genome = initialize_genome.get_genome_in_string() genome = initialize_genome.get_genome_in_string()
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Visualizing Genes Families in Chromosomes of extant genomes ## Visualizing Genes Families in Chromosomes of extant genomes
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
genomes, chromosomes, gene_lengths = [], [], [] genomes, chromosomes, gene_lengths = [], [], []
genome_lengths = [] genome_lengths = []
num_chr = [] num_chr = []
for key, value in genome.items(): for key, value in genome.items():
current_lengths = 0 current_lengths = 0
for genome_id in all_leaves: for genome_id in all_leaves:
if int(genome_id[0]) == int(key): if int(genome_id[0]) == int(key):
genome_name = genome_id[1] genome_name = genome_id[1]
num_chromosomes = int(genome_id[3]) num_chromosomes = int(genome_id[3])
num_chr.append(int(num_chromosomes)) num_chr.append(int(num_chromosomes))
for i, chromosome in enumerate(value): for i, chromosome in enumerate(value):
genomes.append(genome_name) genomes.append(genome_name)
gene_lengths.append(len(chromosome)) gene_lengths.append(len(chromosome))
current_lengths += 1 current_lengths += 1
chromosomes.extend(range(1, num_chromosomes + 1)) chromosomes.extend(range(1, num_chromosomes + 1))
genome_lengths.append(current_lengths) genome_lengths.append(current_lengths)
#print(chromosomes) #print(chromosomes)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
length_index = 0 length_index = 0
# removing short scaffolds for visualization # removing short scaffolds for visualization
for i, lengths in enumerate(genome_lengths): for i, lengths in enumerate(genome_lengths):
left = length_index left = length_index
right = length_index + genome_lengths[i] right = length_index + genome_lengths[i]
gene_lengths[left:right] = sorted(gene_lengths[left:right], reverse = True) gene_lengths[left:right] = sorted(gene_lengths[left:right], reverse = True)
del gene_lengths[left + num_chr[i] : right] del gene_lengths[left + num_chr[i] : right]
del genomes[left + num_chr[i] : right] del genomes[left + num_chr[i] : right]
length_index += num_chr[i] length_index += num_chr[i]
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
print("Visualizing the number of gene families in each chromosome of each genome") print("Visualizing the number of gene families in each chromosome of each genome")
data = {'genome': genomes, 'chromosome number': chromosomes, 'number of genes': gene_lengths} data = {'genome': genomes, 'chromosome number': chromosomes, 'number of genes': gene_lengths}
pddata = pd.DataFrame(data) pddata = pd.DataFrame(data)
sns.set(style="darkgrid") sns.set(style="darkgrid")
g = sns.catplot(x = 'chromosome number', y = 'number of genes', col = 'genome', data = pddata, kind="bar", aspect = .7, sharex = False, dodge = False) g = sns.catplot(x = 'chromosome number', y = 'number of genes', col = 'genome', data = pddata, kind="bar", aspect = .7, sharex = False, dodge = False)
g.set_xticklabels(rotation = 30) g.set_xticklabels(rotation = 30)
``` ```
%% Output %% Output
Visualizing the number of gene families in each chromosome of each genome Visualizing the number of gene families in each chromosome of each genome
/usr/local/anaconda3/lib/python3.8/site-packages/seaborn/categorical.py:3793: UserWarning: Setting `sharex=False` with `color=None` may cause different levels of the `x` variable to share colors. This will change in a future version. /usr/local/anaconda3/lib/python3.8/site-packages/seaborn/categorical.py:3793: UserWarning: Setting `sharex=False` with `color=None` may cause different levels of the `x` variable to share colors. This will change in a future version.
warnings.warn(msg.format("sharex", "x"), UserWarning) warnings.warn(msg.format("sharex", "x"), UserWarning)
<seaborn.axisgrid.FacetGrid at 0x7f6785641ca0> <seaborn.axisgrid.FacetGrid at 0x7f6785641ca0>
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# Section 4: Extract gene proximity (or adjacencies) from each extant genome # Section 4: Extract gene proximity (or adjacencies) from each extant genome
Identify generalized gene adjacencies with a user specified window/gap size. The size parameter (*ws*) is defined in the configuration file. Identify generalized gene adjacencies with a user specified window/gap size. The size parameter (*ws*) is defined in the configuration file.
In order to succesfully prepare adjacency data for the Maximum Weight Matching algorithm, we require: In order to succesfully prepare adjacency data for the Maximum Weight Matching algorithm, we require:
- Successful separation of input data - Successful separation of input data
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Collecting Gene Adjacencies for Maximum Weight Matching ## Collecting Gene Adjacencies for Maximum Weight Matching
Adjacencies are extracted from gene orders in each extant genome. <br> Adjacencies are extracted from gene orders in each extant genome. <br>
A large number of (generalized) adjacencis is collected for each ancestor defined in *median_structure*. A large number of (generalized) adjacencis is collected for each ancestor defined in *median_structure*.
Number of edges are the same since the number of genes and window size is consistent for each ancestor. However, the weights will be different. Number of edges are the same since the number of genes and window size is consistent for each ancestor. However, the weights will be different.
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# preparing data for maximum weight matching + maximum weight matching # preparing data for maximum weight matching + maximum weight matching
window_size = parsed_config['ws'] window_size = parsed_config['ws']
min_adj_weight = parsed_config['min_mwm_weight'] min_adj_weight = parsed_config['min_mwm_weight']
num_gene_families = len(gene_family.gene_family) num_gene_families = len(gene_family.gene_family)
input_tree_node = MWMInputTreeNode(genome, all_leaves) input_tree_node = MWMInputTreeNode(genome, all_leaves)
mwmin_output_template = parsed_config['output_file']['mwm_input_template'] mwmin_output_template = parsed_config['output_file']['mwm_input_template']
num_anc = 1 num_anc = 1
mwm_inputs = [None] * len(median_structure) mwm_inputs = [None] * len(median_structure)
for i, structure in tqdm(enumerate(median_structure), total=len(mwm_inputs)): for i, structure in tqdm(enumerate(median_structure), total=len(mwm_inputs)):
outfile_mwmin = directory + mwmin_output_template + str(num_anc) + ".txt" outfile_mwmin = directory + mwmin_output_template + str(num_anc) + ".txt"
mwm_inputs[num_anc - 1] = input_tree_node.get_mwm_input(structure, num_gene_families, window_size, min_adj_weight, outfile_mwmin) mwm_inputs[num_anc - 1] = input_tree_node.get_mwm_input(structure, num_gene_families, window_size, min_adj_weight, outfile_mwmin)
print("number of edges calculated: " + str(len(mwm_inputs[num_anc - 1]))) print("number of edges calculated: " + str(len(mwm_inputs[num_anc - 1])))
num_anc += 1 num_anc += 1
``` ```
%% Output %% Output
25%|██▌ | 1/4 [00:00<00:02, 1.03it/s] 25%|██▌ | 1/4 [00:00<00:02, 1.03it/s]
number of edges calculated: 135424 number of edges calculated: 135424
50%|█████ | 2/4 [00:01<00:01, 1.03it/s] 50%|█████ | 2/4 [00:01<00:01, 1.03it/s]
number of edges calculated: 135424 number of edges calculated: 135424
75%|███████▌ | 3/4 [00:02<00:00, 1.10it/s] 75%|███████▌ | 3/4 [00:02<00:00, 1.10it/s]
number of edges calculated: 135424 number of edges calculated: 135424
100%|██████████| 4/4 [00:03<00:00, 1.08it/s] 100%|██████████| 4/4 [00:03<00:00, 1.08it/s]
number of edges calculated: 135424 number of edges calculated: 135424
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# Section 5: Maximum Weight Matching # Section 5: Maximum Weight Matching
In order to successfully complete the Maximum Weight Matching algorithm, we require: In order to successfully complete the Maximum Weight Matching algorithm, we require:
- Successful preparation of Maximum Weight Matching Input Data - Successful preparation of Maximum Weight Matching Input Data
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Maximum Weight Matching ## Maximum Weight Matching
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
Produces the result of Maximum Weight Matching for each ancestor. <br> Produces the result of Maximum Weight Matching for each ancestor. <br>
This step is computationally expensive. <br> This step is computationally expensive. <br>
Uses multiprocessing to speed up the process. Uses multiprocessing to speed up the process.
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# The path of output file in general form # The path of output file in general form
mwm_output_template = parsed_config['output_file']['mwm_output_template'] mwm_output_template = parsed_config['output_file']['mwm_output_template']
def mwm_matching(mwm_input, num_anc): def mwm_matching(mwm_input, num_anc):
outfile_mwmout = directory + mwm_output_template + str(num_anc) + ".txt" outfile_mwmout = directory + mwm_output_template + str(num_anc) + ".txt"
maxWeightMatching(mwm_input, outfile_mwmout) maxWeightMatching(mwm_input, outfile_mwmout)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# inputs for multiprocessing # inputs for multiprocessing
numbers = [i for i in range(1, num_anc)] numbers = [i for i in range(1, num_anc)]
inputs = zip(mwm_inputs, numbers) inputs = zip(mwm_inputs, numbers)
start = time.time() start = time.time()
# multiprocessing to make workflow faster # multiprocessing to make workflow faster
# with Pool(num_anc - 1) as pool: # with Pool(num_anc - 1) as pool:
# for _ in tqdm(pool.istarmap(mwm_matching, inputs), total=len(mwm_inputs)): # for _ in tqdm(pool.istarmap(mwm_matching, inputs), total=len(mwm_inputs)):
# pass # pass
with tqdm_joblib.tqdm_joblib(tqdm(desc = "Maximum Weight Matching", total = len(mwm_inputs))) as progress_bar: with tqdm_joblib.tqdm_joblib(tqdm(desc = "Maximum Weight Matching", total = len(mwm_inputs))) as progress_bar:
Parallel(n_jobs=(num_anc - 1))(delayed(mwm_matching)(mwm_inputs[i], numbers[i]) for i in range (0, len(mwm_inputs))) Parallel(n_jobs=(num_anc - 1))(delayed(mwm_matching)(mwm_inputs[i], numbers[i]) for i in range (0, len(mwm_inputs)))
end = time.time() end = time.time()
``` ```
%% Output %% Output
Maximum Weight Matching: 100%|██████████| 4/4 [18:42<00:00, 280.68s/it] Maximum Weight Matching: 100%|██████████| 4/4 [18:42<00:00, 280.68s/it]
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
print("MWM solution was computed in %d minutes" % int((end - start)/60)) print("MWM solution was computed in %d minutes" % int((end - start)/60))
``` ```
%% Output %% Output
MWM solution was computed in 18 minutes MWM solution was computed in 18 minutes
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# Section 6: Constructing Ancestral Contigs # Section 6: Constructing Ancestral Contigs
In order to succesfully construct ancestral contigs, we require: In order to succesfully construct ancestral contigs, we require:
- Successful Maximum Weight Matching on the prepared input data - Successful Maximum Weight Matching on the prepared input data
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Creating Contigs ## Creating Contigs
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
Produces the contigs for each ancestor. Produces the contigs for each ancestor.
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# creating contigs from the maximum weight matching output # creating contigs from the maximum weight matching output
contig_template = parsed_config['output_file']['contig_template'] contig_template = parsed_config['output_file']['contig_template']
# string to use for next step # string to use for next step
dcj_files = [] dcj_files = []
for i in tqdm(range(1, num_anc), total = len(mwm_inputs)): for i in tqdm(range(1, num_anc), total = len(mwm_inputs)):
outfile_mwmout = directory + mwm_output_template + str(i) + ".txt" outfile_mwmout = directory + mwm_output_template + str(i) + ".txt"
outfile_contig = directory + contig_template + str(i) + ".txt" outfile_contig = directory + contig_template + str(i) + ".txt"
dcj_files.append(outfile_contig) dcj_files.append(outfile_contig)
contig = Contig(outfile_mwmout, int(num_gene_families)) contig = Contig(outfile_mwmout, int(num_gene_families))
contig.get_edge() contig.get_edge()
list_telomeres = contig.find_telomeres() list_telomeres = contig.find_telomeres()
contig_list = contig.get_contigs(list_telomeres, outfile_contig, str(i)) contig_list = contig.get_contigs(list_telomeres, outfile_contig, str(i))
print(dcj_files)
``` ```
%% Output %% Output
100%|██████████| 4/4 [00:00<00:00, 8.15it/s] 100%|██████████| 4/4 [00:00<00:00, 8.15it/s]
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# Section 7: Calculating DCJ Distance Between Ancestors # Section 7: Calculating DCJ Distance Between Ancestors
In order to succesfully calculate the DCJ Distance between ancestors, we require: In order to succesfully calculate the DCJ Distance between ancestors, we require:
- Successful creation of ancestral contigs - Successful creation of ancestral contigs
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Calculating DCJ Distance ## Calculating DCJ Distance
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
jar_path = parsed_config['input_file']['jar_path'] jar_path = parsed_config['input_file']['jar_path']
dcj_output = [] dcj_output = []
# Calculate DCJ Distance # Calculate DCJ Distance
print("Starting DCJ Calculations, May Take a While") print("Starting DCJ Calculations, May Take a While")
for i in tqdm(range(1, num_anc)): for i in tqdm(range(1, num_anc)):
for j in range(i + 1, num_anc): for j in range(i + 1, num_anc):
dcj_output_path = directory + parsed_config['output_file']['dcj_output_path'] + str(i) + "_" + str(j) dcj_output_path = directory + parsed_config['output_file']['dcj_output_path'] + str(i) + "_" + str(j)
command = "java -jar " + jar_path + "/UniMoG-java11.jar " + str(dcj_files[i - 1]) + " " + str(dcj_files[j - 1]) + " " + "-m=1 -p >" + dcj_output_path command = "java -jar " + jar_path + "/UniMoG-java11.jar " + str(dcj_files[i - 1]) + " " + str(dcj_files[j - 1]) + " " + "-m=1 -p >" + dcj_output_path
dcj_output.append(dcj_output_path) dcj_output.append(dcj_output_path)
os.system(command) os.system(command)
print("Calculation Done") print("Calculation Done")
``` ```
%% Output %% Output
0%| | 0/4 [00:00<?, ?it/s] 0%| | 0/4 [00:00<?, ?it/s]
Starting DCJ Calculations, May Take a While Starting DCJ Calculations, May Take a While
100%|██████████| 4/4 [01:16<00:00, 19.20s/it] 100%|██████████| 4/4 [01:16<00:00, 19.20s/it]
Calculation Done Calculation Done
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
dcj_summary = directory + parsed_config['output_file']['dcj_summary_path'] dcj_summary = directory + parsed_config['output_file']['dcj_summary_path']
with open(dcj_summary, 'w') as dcj_summary_file: with open(dcj_summary, 'w') as dcj_summary_file:
for i in range(len(median_structure)): for i in range(len(median_structure)):
dcj_summary_file.write("Median Structure for Ancestor " + str((i + 1)) + ": " + "%s" % median_structure[i] + "\n") dcj_summary_file.write("Median Structure for Ancestor " + str((i + 1)) + ": " + "%s" % median_structure[i] + "\n")
for file in dcj_output: for file in dcj_output:
path = file path = file
with open(path, 'r') as dcj_file: with open(path, 'r') as dcj_file:
dcj_info = dcj_file.readlines() dcj_info = dcj_file.readlines()
dcj_summary_file.write(dcj_info[0]) dcj_summary_file.write(dcj_info[0])
print("Summary Done") print("Summary Done")
``` ```
%% Output %% Output
Summary Done Summary Done
......