Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • cpv616/module1_2
  • nma904/module1_2
2 results
Show changes
Showing
with 294 additions and 118 deletions
chr size chr size
1 19561049 quisa.S10.gnm1.Chr01 19561049
2 20977047 quisa.S10.gnm1.Chr02 20977047
3 37318896 quisa.S10.gnm1.Chr03 37318896
4 34047100 quisa.S10.gnm1.Chr04 34047100
5 21375137 quisa.S10.gnm1.Chr05 21375137
6 28584509 quisa.S10.gnm1.Chr06 28584509
7 26440503 quisa.S10.gnm1.Chr07 26440503
8 28358298 quisa.S10.gnm1.Chr08 28358298
9 26458845 quisa.S10.gnm1.Chr09 26458845
10 21706344 quisa.S10.gnm1.Chr10 21706344
11 21697428 quisa.S10.gnm1.Chr11 21697428
12 21470783 quisa.S10.gnm1.Chr12 21470783
13 18106809 quisa.S10.gnm1.Chr13 18106809
14 20802709 quisa.S10.gnm1.Chr14 20802709
\ No newline at end of file \ No newline at end of file
chr size chr size
1 57468748 cerca.ISC453364.gnm3.Chr01 57468748
2 51272464 cerca.ISC453364.gnm3.Chr02 51272464
3 49996358 cerca.ISC453364.gnm3.Chr03 49996358
4 48286772 cerca.ISC453364.gnm3.Chr04 48286772
5 45669363 cerca.ISC453364.gnm3.Chr05 45669363
6 45655421 cerca.ISC453364.gnm3.Chr06 45655421
7 41929180 cerca.ISC453364.gnm3.Chr07 41929180
\ No newline at end of file \ No newline at end of file
chr size chr size
1 136071090 singl.CAF01.gnm1.Chr01 136071090
2 116487353 singl.CAF01.gnm1.Chr02 116487353
3 109186524 singl.CAF01.gnm1.Chr03 109186524
4 91914064 singl.CAF01.gnm1.Chr04 91914064
5 88658134 singl.CAF01.gnm1.Chr05 88658134
6 84872623 singl.CAF01.gnm1.Chr06 84872623
7 83373435 singl.CAF01.gnm1.Chr07 83373435
8 81990953 singl.CAF01.gnm1.Chr08 81990953
9 76993406 singl.CAF01.gnm1.Chr09 76993406
10 75912673 singl.CAF01.gnm1.Chr10 75912673
11 75144154 singl.CAF01.gnm1.Chr11 75144154
12 71117942 singl.CAF01.gnm1.Chr12 71117942
\ No newline at end of file \ No newline at end of file
chr size chr size
1 32816166 sento.Myeongyun.gnm1.Chr01 32816166
2 42009719 sento.Myeongyun.gnm1.Chr02 42009719
3 37860065 sento.Myeongyun.gnm1.Chr03 37860065
4 30689712 sento.Myeongyun.gnm1.Chr04 30689712
5 52777034 sento.Myeongyun.gnm1.Chr05 52777034
6 46512068 sento.Myeongyun.gnm1.Chr06 46512068
7 30975534 sento.Myeongyun.gnm1.Chr07 30975534
8 49705205 sento.Myeongyun.gnm1.Chr08 49705205
9 35860388 sento.Myeongyun.gnm1.Chr09 35860388
10 41499577 sento.Myeongyun.gnm1.Chr10 41499577
11 30871617 sento.Myeongyun.gnm1.Chr11 30871617
12 29799933 sento.Myeongyun.gnm1.Chr12 29799933
13 41270226 sento.Myeongyun.gnm1.Chr13 41270226
\ No newline at end of file \ No newline at end of file
chr size chr size
1 47851208 prupe.Lovell.gnm2.Pp01 47851208
2 30405870 prupe.Lovell.gnm2.Pp02 30405870
3 27368013 prupe.Lovell.gnm2.Pp03 27368013
4 25843236 prupe.Lovell.gnm2.Pp04 25843236
5 18496696 prupe.Lovell.gnm2.Pp05 18496696
6 30767194 prupe.Lovell.gnm2.Pp06 30767194
7 22388614 prupe.Lovell.gnm2.Pp07 22388614
8 22573980 prupe.Lovell.gnm2.Pp08 22573980
\ No newline at end of file \ No newline at end of file
chr size chr size
1 51433939 phavu.G19833.gnm2.Chr01 51433939
2 49670989 phavu.G19833.gnm2.Chr02 49670989
3 53438756 phavu.G19833.gnm2.Chr03 53438756
4 48048378 phavu.G19833.gnm2.Chr04 48048378
5 40923498 phavu.G19833.gnm2.Chr05 40923498
6 31236378 phavu.G19833.gnm2.Chr06 31236378
7 40041001 phavu.G19833.gnm2.Chr07 40041001
8 63048260 phavu.G19833.gnm2.Chr08 63048260
9 38250102 phavu.G19833.gnm2.Chr09 38250102
10 44302882 phavu.G19833.gnm2.Chr10 44302882
11 53580169 phavu.G19833.gnm2.Chr11 53580169
\ No newline at end of file \ No newline at end of file
chr size chr size
1 52991155 medtr.A17_HM341.gnm4.chr1 52991155
2 45729672 medtr.A17_HM341.gnm4.chr2 45729672
3 55515152 medtr.A17_HM341.gnm4.chr3 55515152
4 56582383 medtr.A17_HM341.gnm4.chr4 56582383
5 43630510 medtr.A17_HM341.gnm4.chr5 43630510
6 35275713 medtr.A17_HM341.gnm4.chr6 35275713
7 49172423 medtr.A17_HM341.gnm4.chr7 49172423
8 45569985 medtr.A17_HM341.gnm4.chr8 45569985
\ No newline at end of file \ No newline at end of file
chr size chr size
1 23037639 vitvi.PN40024.gnm2.chr1 23037639
2 18779844 vitvi.PN40024.gnm2.chr2 18779844
3 19341862 vitvi.PN40024.gnm2.chr3 19341862
4 23867706 vitvi.PN40024.gnm2.chr4 23867706
5 25021643 vitvi.PN40024.gnm2.chr5 25021643
6 21508407 vitvi.PN40024.gnm2.chr6 21508407
7 21026613 vitvi.PN40024.gnm2.chr7 21026613
8 22385789 vitvi.PN40024.gnm2.chr8 22385789
9 23006712 vitvi.PN40024.gnm2.chr9 23006712
10 18140952 vitvi.PN40024.gnm2.chr10 18140952
11 19818926 vitvi.PN40024.gnm2.chr11 19818926
12 22702307 vitvi.PN40024.gnm2.chr12 22702307
13 24396255 vitvi.PN40024.gnm2.chr13 24396255
14 30274277 vitvi.PN40024.gnm2.chr14 30274277
15 20304914 vitvi.PN40024.gnm2.chr15 20304914
16 22053297 vitvi.PN40024.gnm2.chr16 22053297
17 17126926 vitvi.PN40024.gnm2.chr17 17126926
18 29360087 vitvi.PN40024.gnm2.chr18 29360087
19 24021853 vitvi.PN40024.gnm2.chr19 24021853
\ No newline at end of file \ No newline at end of file
# project name # project name
project_name: "project-monocots" project_name: "project-buxus"
# input : path to genome information file # input : path to genome information file
input_file: input_file:
...@@ -52,21 +52,21 @@ gf3: 1 ...@@ -52,21 +52,21 @@ gf3: 1
K: 7 K: 7
## set thresholds ## set thresholds
ctgLen: 1 ## minimum length in each contig #ctgLen: 1 ## minimum length in each contig
nctg: 250 ## only include first nctg contigs #nctg: 250 ## only include first nctg contigs
## threshold of distance (1MB) between two genes in the same contig on the same chromosome ## threshold of distance (1MB) between two genes in the same contig on the same chromosome
## if the distance between two genes is smaller than the threshold, merge them into one block ## if the distance between two genes is smaller than the threshold, merge them into one block
DIS.threshold: 1000000 #DIS.threshold: 1000000
## threshold to visualize the gene/block (150 kb) ## threshold to visualize the gene/block (150 kb)
## if a block is less than this length, do not show it --> keep the plot clean ## if a block is less than this length, do not show it --> keep the plot clean
blockLEN.threshold: 150000 #blockLEN.threshold: 150000
## threshold for coocurrence analysis ## threshold for coocurrence analysis
## only count blocks that are longer than 15KB ## only count blocks that are longer than 15KB
lenBLK.threshold: 15000 #lenBLK.threshold: 15000
......
chr size
LG01 24880688
LG02 17334668
LG03 16781886
LG04 15584765
LG05 15024279
LG06 14747297
LG07 14728688
LG08 13970067
LG09 13841557
LG10 13112115
LG11 13096665
LG12 12612916
LG13 11759267
LG14 11705491
LG15 11365478
LG16 11174208
LG17 11156084
LG18 10911977
LG19 10815949
LG20 10792357
LG21 10645290
LG22 10398566
LG23 7979763
LG24 7684463
LG25 3739203
chr size
REF_ELAGV01 68435666
REF_ELAGV02 65558141
REF_ELAGV03 60060032
REF_ELAGV04 57251647
REF_ELAGV05 51955539
REF_ELAGV06 44357769
REF_ELAGV07 43454766
REF_ELAGV08 40195399
REF_ELAGV09 38056896
REF_ELAGV10 31890735
REF_ELAGV11 30068910
REF_ELAGV12 28800575
REF_ELAGV13 27817470
REF_ELAGV14 24379743
REF_ELAGV15 24314465
REF_ELAGV16 21371083
chr size
AsparagusV1_01 132393784
AsparagusV1_02 84732013
AsparagusV1_03 113259003
AsparagusV1_04 146560859
AsparagusV1_05 131469567
AsparagusV1_06 78779235
AsparagusV1_07 151357931
AsparagusV1_08 131339754
AsparagusV1_09 68543460
AsparagusV1_10 73452336
chr size
1 31431123
2 33969142
3 19547260
4 27749968
5 32871075
6 32866201
7 18053115
8 28021425
9 23324680
10 17904088
11 17195437
12 25272979
13 29719729
14 15713101
15 10474557
16 23371965
17 20585887
18 22795539
19 9927364
20 10677198
21 5203141
chr size
1 11560055
2 8180930
3 8870115
4 9537711
5 7072156
6 8333076
7 7949387
8 8482602
9 7754086
10 7963964
11 6749155
12 5993893
13 6317482
14 5575328
15 4557367
16 5820572
17 4601490
18 5315945
19 3959434
20 3997407
chr size 1 37743429 2 34360390 3 33772675 4 31994915 5 31167455 6 30970160 7 30771956 8 28400560 9 28235908 10 25518903 11 25169648 12 24790462
\ No newline at end of file
...@@ -81,7 +81,7 @@ class Genome: ...@@ -81,7 +81,7 @@ class Genome:
f.write(chromosome + " ") f.write(chromosome + " ")
for gene in genes: for gene in genes:
f.write("%s " % genes) f.write("%s " % gene)
f.write("\n") f.write("\n")
......
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# RACCROCHE - Module 1 & 2 # RACCROCHE - Module 1 & 2
This is a full workflow that shows methods regarding the construction of gene families, listing of candidate adjacencies and construction of ancestral contigs using Maximum Weight Matching. This is a full workflow that shows methods regarding the construction of gene families, listing of candidate adjacencies and construction of ancestral contigs using Maximum Weight Matching.
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# Section 1: Importing Libraries, Modules and Configuration File # Section 1: Importing Libraries, Modules and Configuration File
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# setting the Notebook Display Method # setting the Notebook Display Method
%matplotlib inline %matplotlib inline
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# install tqdm-joblib # install tqdm-joblib
%pip install tqdm-joblib %pip install tqdm-joblib
# install Bio # install Bio
%pip install Bio %pip install Bio
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# import libraries # import libraries
import sys import sys
import yaml import yaml
import time import time
import os import os
import io import io
import pandas as pd import pandas as pd
import seaborn as sns import seaborn as sns
import matplotlib import matplotlib
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from Bio import Phylo from Bio import Phylo
from scipy.spatial.distance import squareform from scipy.spatial.distance import squareform
from tqdm import tqdm from tqdm import tqdm
import tqdm_joblib import tqdm_joblib
from joblib import Parallel, delayed from joblib import Parallel, delayed
# import modules # import modules
#? eventually the below list will be in your package #? eventually the below list will be in your package
from GeneFamily import GeneFamily from GeneFamily import GeneFamily
from Genome import Genome from Genome import Genome
from MWMInputTreeNode import MWMInputTreeNode from MWMInputTreeNode import MWMInputTreeNode
from mwmatching import * from mwmatching import *
from Contig import * from Contig import *
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Parsing the Configuration File ## Parsing the Configuration File
The configuration file consists of a list of input/output directories, input files and parameters to the pipeline. The configuration file consists of a list of input/output directories, input files and parameters to the pipeline.
Specifications for Input: Specifications for Input:
- leaf_genome_info: the input genomes with their positions on the phylogenetic tree, i.e. *Genomes.txt* <br> - leaf_genome_info: the input genomes with their positions on the phylogenetic tree, i.e. *Genomes.txt* <br>
- synteny_file_name: the post-fix of the synmap output of pairwise genome comparisons; <br> - synteny_file_name: the post-fix of the synmap output of pairwise genome comparisons; <br>
- synteny_file_path: the path to the synmap output data <br> - synteny_file_path: the path to the synmap output data <br>
- phylo_tree_path: the path to the Newick Phylogenetic Tree structure <br> - phylo_tree_path: the path to the Newick Phylogenetic Tree structure <br>
- jar_path: the path to the UniMoG jar file <br> - jar_path: the path to the UniMoG jar file <br>
__Note__: The input genomes is in the *Genomes.txt* file. __Note__: The input genomes is in the *Genomes.txt* file.
It contains the information about input genomes, including CoGe ID, genomeName, ancestor<br> It contains the information about input genomes, including CoGe ID, genomeName, ancestor<br>
The phylogenetic relationship is in the *phyloTree* file. The phylogenetic relationship is in the *phyloTree* file.
It includes the unrooted phylogenetic tree in newick format where each tree leaf is denoted as name_CoGeID. <br> It includes the unrooted phylogenetic tree in newick format where each tree leaf is denoted as name_CoGeID. <br>
Sample Newick Tree for the monocot project: ( ( (Spirodela_51364, Acorus_54711), Dioscorea_51051), ( (Ananas_25734, Elaeis_33018), Asparagus_33908) ) Sample Newick Tree for the monocot project: ( ( (Spirodela_51364, Acorus_54711), Dioscorea_51051), ( (Ananas_25734, Elaeis_33018), Asparagus_33908) )
Specifications for Output: Specifications for Output:
- gene_list: path where the gene list output should be created <br> - gene_list: path where the gene list output should be created <br>
- gene_family: path where the gene family output should be created <br> - gene_family: path where the gene family output should be created <br>
- genomes: path where the genome string output should be created <br> - genomes: path where the genome string output should be created <br>
- mwm_input_template: path and template for the output files created from MWM Input step <br> - mwm_input_template: path and template for the output files created from MWM Input step <br>
- mwm_output_template: path and template for the output files created from MWM <br> - mwm_output_template: path and template for the output files created from MWM <br>
- contig_template: path and template for the output files created from constructing contigs <br> - contig_template: path and template for the output files created from constructing contigs <br>
- dcj_output_path: path to the output file when calculating DCJ distance between ancestors <br> - dcj_output_path: path to the output file when calculating DCJ distance between ancestors <br>
- dcj_summary_path: path to the output file containing a summary of the DCJ calculations <br><br> - dcj_summary_path: path to the output file containing a summary of the DCJ calculations <br><br>
Global parameters: Global parameters:
- min_cutoff_weight: minimum similarity cutoff weight for gene family construction, i.e. *min_cutoff_weight=65* - min_cutoff_weight: minimum similarity cutoff weight for gene family construction, i.e. *min_cutoff_weight=65*
- max_cutoff_weight: maximum similarity cutoff weight for gene family construction, i.e. *max_cutoff_weight=100* - max_cutoff_weight: maximum similarity cutoff weight for gene family construction, i.e. *max_cutoff_weight=100*
- ws: window size, i.e. *ws=7* - ws: window size, i.e. *ws=7*
- min_mwm_weight: minimum adjacency weight to be considered for maximum weight matching, i.e. *min_mwm_weight=100* - min_mwm_weight: minimum adjacency weight to be considered for maximum weight matching, i.e. *min_mwm_weight=100*
- gf1: maximum number of genes in a gene family, i.e. *gf1=50* - gf1: maximum number of genes in a gene family, i.e. *gf1=50*
- gf2: the maximum number of genes from a genome in a gene family, i.e. *gf2=10* - gf2: the maximum number of genes from a genome in a gene family, i.e. *gf2=10*
- gf3: the minimum number of genomes in a gene family, i.e. *gf3=1* - gf3: the minimum number of genomes in a gene family, i.e. *gf3=1*
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# specify your Project Name # specify your Project Name
proj_name = "monocots" proj_name = "buxus"
# reading from the configuration file # reading from the configuration file
config_file = "../inputData/project-" + proj_name + "/config.yaml" config_file = "../inputData/project-" + proj_name + "/config.yaml"
with open(config_file, 'r') as stream: with open(config_file, 'r') as stream:
parsed_config = yaml.load(stream, Loader=yaml.FullLoader) parsed_config = yaml.load(stream, Loader=yaml.FullLoader)
directory = parsed_config['output_path'] + parsed_config['project_name'] directory = parsed_config['output_path'] + parsed_config['project_name']
os.makedirs(directory, exist_ok = True) os.makedirs(directory, exist_ok = True)
print("Project Name and Input Directory:", parsed_config['input_path'] + parsed_config['project_name']) print("Project Name and Input Directory:", parsed_config['input_path'] + parsed_config['project_name'])
print("Project Name and Output Directory:", parsed_config['output_path'] + parsed_config['project_name']) print("Project Name and Output Directory:", parsed_config['output_path'] + parsed_config['project_name'])
print('''Please check required input data under the Input Directory: print('''Please check required input data under the Input Directory:
\t 1. Genome.txt with info about input genomes \t 1. Genome.txt with info about input genomes
\t 2. phyloTree.txt with Newick Tree Structure \t 2. phyloTree.txt with Newick Tree Structure
\t 3. SynMap results between every pair of genomes \t 3. SynMap results between every pair of genomes
''') ''')
#print("Project input data directory:", parsed_config['input_file']['synteny_file_path']) #print("Project input data directory:", parsed_config['input_file']['synteny_file_path'])
#print("Input extant genomes info:", parsed_config['input_file']['leaf_genome_info']) #print("Input extant genomes info:", parsed_config['input_file']['leaf_genome_info'])
#print("The postfix of SynMap output files:", parsed_config['input_file']['synteny_file_name']) #print("The postfix of SynMap output files:", parsed_config['input_file']['synteny_file_name'])
``` ```
%% Output %% Output
Project Name and Input Directory: ../inputData/project-monocots Project Name and Input Directory: ../inputData/project-buxus
Project Name and Output Directory: ../outputData/project-monocots Project Name and Output Directory: ../outputData/project-buxus
Please check required input data under the Input Directory: Please check required input data under the Input Directory:
1. Genome.txt with info about input genomes 1. Genome.txt with info about input genomes
2. phyloTree.txt with Newick Tree Structure 2. phyloTree.txt with Newick Tree Structure
3. SynMap results between every pair of genomes 3. SynMap results between every pair of genomes
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# Section 2: Constructing Gene Families from syntenically validated orthologs from SynMap # Section 2: Constructing Gene Families from syntenically validated orthologs from SynMap
In order to succesfully construct gene families, we require: In order to succesfully construct gene families, we require:
- Successful parsing of the configuration file <br> - Successful parsing of the configuration file <br>
- Valid parameters in the configuration file - Valid parameters in the configuration file
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Reading in Input Genome Data and Tree Structure Data ## Reading in Input Genome Data and Tree Structure Data
Each genome is a leaf of the input phylogenetic tree. The *get_leaves_and_tree_info()* method extracts the following attribute of each genome from the input file specified by *leaf_genome_info* parameter in the configuration file. Each genome is a leaf of the input phylogenetic tree. The *get_leaves_and_tree_info()* method extracts the following attribute of each genome from the input file specified by *leaf_genome_info* parameter in the configuration file.
>1) genome ID; <br> >1) genome ID; <br>
>2) genome name; <br> >2) genome name; <br>
>3) most recent ancestor; <br> >3) most recent ancestor; <br>
>4) number of chromosomes; <br> >4) number of chromosomes; <br>
>5) file name of the genome annotations; <br> >5) file name of the genome annotations; <br>
>6) desired tree structure in Newick format. >6) desired tree structure in Newick format.
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# reading in input genome info # reading in input genome info
gene_family = GeneFamily(parsed_config) gene_family = GeneFamily(parsed_config)
all_leaves, median_structure, newick_structure = gene_family.get_leaves_and_tree_info() all_leaves, median_structure, newick_structure = gene_family.get_leaves_and_tree_info()
print("Extant genomes to be analyzed in this project: \n") print("Extant genomes to be analyzed in this project: \n")
genomes = pd.DataFrame(all_leaves) genomes = pd.DataFrame(all_leaves)
display = genomes.iloc[:, :3] display = genomes.iloc[:, :3]
display.columns = ['Genome ID', 'Genome Name', 'Ancestor'] display.columns = ['Genome ID', 'Genome Name', 'Ancestor']
display.set_index('Genome ID', inplace = True, drop = True) display.set_index('Genome ID', inplace = True, drop = True)
display display
``` ```
%% Output %% Output
Confirm ancestors: Confirm ancestors:
['51364', '54711', '25734,33018,33908,51051'] ['19990', '54057', '57266,57268,28620,35405']
['51364,54711', '51051', '25734,33018,33908'] ['57266', '57268', '54057,19990,28620,35405']
['51364,54711,51051', '33908', '25734,33018'] ['19990,54057', '57266,57268', '28620,35405']
['25734', '33018', '33908,51051,51364,54711'] ['28620', '35405', '54057,19990,57266,57268']
Extant genomes to be analyzed in this project: Extant genomes to be analyzed in this project:
Genome Name Ancestor Genome Name Ancestor
Genome ID Genome ID
25734 Ananas 4 19990 Vitis 1
33018 Elaeis 4 28620 Aquilegea 4
33908 Asparagus 3 35405 Nelumbo 4
51051 Dioscorea 2 54057 Amaranthus 1
51364 Spirodela 1 57266 Tetracentron 2
54711 Acorus 1 57268 Buxus 2
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
tree = Phylo.read(io.StringIO(newick_structure), "newick") tree = Phylo.read(io.StringIO(newick_structure), "newick")
matplotlib.rc('font', size=8) matplotlib.rc('font', size=8)
fig = plt.figure(figsize=(8, 5), dpi=100) fig = plt.figure(figsize=(8, 5), dpi=100)
axes = fig.add_subplot(1, 1, 1) axes = fig.add_subplot(1, 1, 1)
axes.set_facecolor('white') axes.set_facecolor('white')
Phylo.draw(tree, axes = axes) Phylo.draw(tree, axes = axes)
``` ```
%% Output %% Output
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Creating gene families from syntenically validated homology generated from SynMap ## Creating gene families from syntenically validated homology generated from SynMap
Read in all genes for every genome, record gene-related information and use gf1 (from config file), cutoff weights in order to create gene families. Read in all genes for every genome, record gene-related information and use gf1 (from config file), cutoff weights in order to create gene families.
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
print(all_leaves) print(all_leaves)
# creating gene families # creating gene families
gene_family.make_gene_family(all_leaves) gene_family.make_gene_family(all_leaves)
print(len(all_leaves)) print(len(all_leaves))
``` ```
%% Output %% Output
[['25734', 'Ananas', '4', '25'], ['33018', 'Elaeis', '4', '16'], ['33908', 'Asparagus', '3', '10'], ['51051', 'Dioscorea', '2', '21'], ['51364', 'Spirodela', '1', '20'], ['54711', 'Acorus', '1', '12']] [['25734', 'Ananas', '4', '25'], ['33018', 'Elaeis', '4', '16'], ['33908', 'Asparagus', '3', '10'], ['51051', 'Dioscorea', '2', '21'], ['51364', 'Spirodela', '1', '20'], ['54711', 'Acorus', '1', '12']]
6 6
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Visualizing Gene Families ## Visualizing Gene Families
In the configuration file *config.yaml*, there are three parameters to restrict the size of gene families: In the configuration file *config.yaml*, there are three parameters to restrict the size of gene families:
- gf1: the maximum number of genes in a gene family - gf1: the maximum number of genes in a gene family
- gf2: the maximum number of genes from a single genome in a gene family - gf2: the maximum number of genes from a single genome in a gene family
- gf3: the minimum number of genomes in a gene family <br> - gf3: the minimum number of genomes in a gene family <br>
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
families = list(gene_family.gene_family.keys()) families = list(gene_family.gene_family.keys())
values = list(gene_family.gene_family.values()) values = list(gene_family.gene_family.values())
lengths = [len(values[i]) for i in range(0, len(values))] lengths = [len(values[i]) for i in range(0, len(values))]
print("Total number of gene familes:", len(families)) print("Total number of gene familes:", len(families))
print("The size of the largest gene family:", max(lengths)) print("The size of the largest gene family:", max(lengths))
``` ```
%% Output %% Output
Total number of gene familes: 8790 Total number of gene familes: 10236
The size of the largest gene family: 49 The size of the largest gene family: 49
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
sns.set(style="darkgrid") sns.set(style="darkgrid")
family_dict = {'label': families, 'lengths': lengths} family_dict = {'label': families, 'lengths': lengths}
pdfamily = pd.DataFrame(family_dict) pdfamily = pd.DataFrame(family_dict)
fig=sns.histplot(data = pdfamily, x = "lengths", binwidth = 3).set(title = "Overview of Gene Families") fig=sns.histplot(data = pdfamily, x = "lengths", binwidth = 3).set(title = "Overview of Gene Families")
plt.xlabel("Number of genes in a gene family") plt.xlabel("Number of genes in a gene family")
plt.ylabel("Frequency") plt.ylabel("Frequency")
plt.show(fig) plt.show(fig)
``` ```
%% Output %% Output
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# Section 3: Representing Genomes by Gene Family Labels # Section 3: Representing Genomes by Gene Family Labels
In order to succesfully separate the input data (genes) into respective chromosomes and genomes, we require: In order to succesfully separate the input data (genes) into respective chromosomes and genomes, we require:
- Successful creation of gene families - Successful creation of gene families
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Grouping Genes and their ordering by Chromosomes then Chromosomes by Genomes ## Grouping Genes and their ordering by Chromosomes then Chromosomes by Genomes
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# creating an instance of the Genome class # creating an instance of the Genome class
initialize_genome = Genome(gene_family.gene_list, all_leaves, parsed_config) initialize_genome = Genome(gene_family.gene_list, all_leaves, parsed_config)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# separating each genome's genes into chromosomes # separating each genome's genes into chromosomes
genome = initialize_genome.get_genome_in_string() genome = initialize_genome.get_genome_in_string()
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Visualizing Genes Families in Chromosomes of extant genomes ## Visualizing Genes Families in Chromosomes of extant genomes
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
genomes, chromosomes, gene_lengths = [], [], [] genomes, chromosomes, gene_lengths = [], [], []
genome_lengths = [] genome_lengths = []
num_chr = [] num_chr = []
for key, value in genome.items(): for key, value in genome.items():
current_lengths = 0 current_lengths = 0
for genome_id in all_leaves: for genome_id in all_leaves:
if int(genome_id[0]) == int(key): if int(genome_id[0]) == int(key):
genome_name = genome_id[1] genome_name = genome_id[1]
num_chromosomes = int(genome_id[3]) num_chromosomes = int(genome_id[3])
num_chr.append(int(num_chromosomes)) num_chr.append(int(num_chromosomes))
for i, chromosome in enumerate(value): for i, chromosome in enumerate(value):
genomes.append(genome_name) genomes.append(genome_name)
gene_lengths.append(len(chromosome)) gene_lengths.append(len(chromosome))
current_lengths += 1 current_lengths += 1
chromosomes.extend(range(1, num_chromosomes + 1)) chromosomes.extend(range(1, num_chromosomes + 1))
genome_lengths.append(current_lengths) genome_lengths.append(current_lengths)
#print(chromosomes) #print(chromosomes)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
length_index = 0 length_index = 0
# removing short scaffolds for visualization # removing short scaffolds for visualization
for i, lengths in enumerate(genome_lengths): for i, lengths in enumerate(genome_lengths):
left = length_index left = length_index
right = length_index + genome_lengths[i] right = length_index + genome_lengths[i]
gene_lengths[left:right] = sorted(gene_lengths[left:right], reverse = True) gene_lengths[left:right] = sorted(gene_lengths[left:right], reverse = True)
del gene_lengths[left + num_chr[i] : right] del gene_lengths[left + num_chr[i] : right]
del genomes[left + num_chr[i] : right] del genomes[left + num_chr[i] : right]
length_index += num_chr[i] length_index += num_chr[i]
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
print("Visualizing the number of gene families in each chromosome of each genome") print("Visualizing the number of gene families in each chromosome of each genome")
data = {'genome': genomes, 'chromosome number': chromosomes, 'number of gene families': gene_lengths} data = {'genome': genomes, 'chromosome number': chromosomes, 'number of gene families': gene_lengths}
pddata = pd.DataFrame(data) pddata = pd.DataFrame(data)
sns.set(style="darkgrid") sns.set(style="darkgrid")
g = sns.catplot(x = 'chromosome number', y = 'number of gene families', col = 'genome', data = pddata, kind="bar", aspect = .7, sharex = False, dodge = False) g = sns.catplot(x = 'chromosome number', y = 'number of gene families', col = 'genome', data = pddata, kind="bar", aspect = .7, sharex = False, dodge = False)
g.set_xticklabels(rotation = 30) g.set_xticklabels(rotation = 30)
``` ```
%% Output %% Output
Visualizing the number of gene families in each chromosome of each genome Visualizing the number of gene families in each chromosome of each genome
/Users/lij313/opt/anaconda3/lib/python3.8/site-packages/seaborn/categorical.py:3803: UserWarning: Setting `sharex=False` with `color=None` may cause different levels of the `x` variable to share colors. This will change in a future version. /Users/lij313/opt/anaconda3/lib/python3.8/site-packages/seaborn/categorical.py:3803: UserWarning: Setting `sharex=False` with `color=None` may cause different levels of the `x` variable to share colors. This will change in a future version.
warnings.warn(msg.format("sharex", "x"), UserWarning) warnings.warn(msg.format("sharex", "x"), UserWarning)
<seaborn.axisgrid.FacetGrid at 0x7fc296913190> <seaborn.axisgrid.FacetGrid at 0x7fd3b1ac57c0>
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# Section 4: Extract gene proximity (or adjacencies) from each extant genome # Section 4: Extract gene proximity (or adjacencies) from each extant genome
Identify generalized gene adjacencies with a user specified window/gap size. The size parameter (*ws*) is defined in the configuration file. Identify generalized gene adjacencies with a user specified window/gap size. The size parameter (*ws*) is defined in the configuration file.
In order to succesfully prepare adjacency data for the Maximum Weight Matching algorithm, we require: In order to succesfully prepare adjacency data for the Maximum Weight Matching algorithm, we require:
- Successful separation of input data - Successful separation of input data
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Collecting Gene Adjacencies for Maximum Weight Matching ## Collecting Gene Adjacencies for Maximum Weight Matching
Adjacencies are extracted from gene orders in each extant genome. <br> Adjacencies are extracted from gene orders in each extant genome. <br>
A large number of (generalized) adjacencies is collected for each ancestor defined in *median_structure*. A large number of (generalized) adjacencies is collected for each ancestor defined in *median_structure*.
Number of edges are the same since the number of genes and window size is consistent for each ancestor. However, the weights will be different. Number of edges are the same since the number of genes and window size is consistent for each ancestor. However, the weights will be different.
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
### manually change the median structure to construct monoploid extant genome ### manually change the median structure to construct monoploid extant genome
#median_structure=[['61837', '61837', '61837']] #median_structure=[['61837', '61837', '61837']]
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# preparing data for maximum weight matching + maximum weight matching # preparing data for maximum weight matching + maximum weight matching
window_size = parsed_config['ws'] window_size = parsed_config['ws']
min_adj_weight = parsed_config['min_mwm_weight'] min_adj_weight = parsed_config['min_mwm_weight']
num_gene_families = len(gene_family.gene_family) num_gene_families = len(gene_family.gene_family)
input_tree_node = MWMInputTreeNode(genome, all_leaves) input_tree_node = MWMInputTreeNode(genome, all_leaves)
mwmin_output_template = parsed_config['output_file']['mwm_input_template'] mwmin_output_template = parsed_config['output_file']['mwm_input_template']
num_anc = 1 num_anc = 1
mwm_inputs = [None] * len(median_structure) mwm_inputs = [None] * len(median_structure)
for i, structure in tqdm(enumerate(median_structure), total=len(mwm_inputs)): for i, structure in tqdm(enumerate(median_structure), total=len(mwm_inputs)):
outfile_mwmin = directory + mwmin_output_template + str(num_anc) + ".txt" outfile_mwmin = directory + mwmin_output_template + str(num_anc) + ".txt"
mwm_inputs[num_anc - 1] = input_tree_node.get_mwm_input(structure, num_gene_families, window_size, min_adj_weight, outfile_mwmin) mwm_inputs[num_anc - 1] = input_tree_node.get_mwm_input(structure, num_gene_families, window_size, min_adj_weight, outfile_mwmin)
num_anc += 1 num_anc += 1
``` ```
%% Output %% Output
100%|██████████| 4/4 [00:02<00:00, 1.53it/s] 100%|██████████| 4/4 [00:02<00:00, 1.50it/s]
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# Section 5: Maximum Weight Matching # Section 5: Maximum Weight Matching
In order to successfully complete the Maximum Weight Matching algorithm, we require: In order to successfully complete the Maximum Weight Matching algorithm, we require:
- Successful preparation of Maximum Weight Matching Input Data - Successful preparation of Maximum Weight Matching Input Data
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Maximum Weight Matching ## Maximum Weight Matching
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
Produces the result of Maximum Weight Matching for each ancestor. <br> Produces the result of Maximum Weight Matching for each ancestor. <br>
This step is computationally expensive. <br> This step is computationally expensive. <br>
Uses multiprocessing to speed up the process. Uses multiprocessing to speed up the process.
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# The path of output file in general form # The path of output file in general form
mwm_output_template = parsed_config['output_file']['mwm_output_template'] mwm_output_template = parsed_config['output_file']['mwm_output_template']
def mwm_matching(mwm_input, num_anc): def mwm_matching(mwm_input, num_anc):
outfile_mwmout = directory + mwm_output_template + str(num_anc) + ".txt" outfile_mwmout = directory + mwm_output_template + str(num_anc) + ".txt"
maxWeightMatching(mwm_input, outfile_mwmout) maxWeightMatching(mwm_input, outfile_mwmout)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# this step is slow # this step is slow
# inputs for multiprocessing # inputs for multiprocessing
numbers = [i for i in range(1, num_anc)] numbers = [i for i in range(1, num_anc)]
inputs = zip(mwm_inputs, numbers) inputs = zip(mwm_inputs, numbers)
start = time.time() start = time.time()
# multiprocessing to make workflow faster # multiprocessing to make workflow faster
with tqdm_joblib.tqdm_joblib(tqdm(desc = "Maximum Weight Matching", total = len(mwm_inputs))) as progress_bar: with tqdm_joblib.tqdm_joblib(tqdm(desc = "Maximum Weight Matching", total = len(mwm_inputs))) as progress_bar:
Parallel(n_jobs=(num_anc - 1))(delayed(mwm_matching)(mwm_inputs[i], numbers[i]) for i in range (0, len(mwm_inputs))) Parallel(n_jobs=(num_anc - 1))(delayed(mwm_matching)(mwm_inputs[i], numbers[i]) for i in range (0, len(mwm_inputs)))
end = time.time() end = time.time()
``` ```
%% Output %% Output
Maximum Weight Matching: 0%| | 0/4 [00:00<?, ?it/s] Maximum Weight Matching: 0%| | 0/4 [00:00<?, ?it/s]
0%| | 0/4 [00:00<?, ?it/s] 0%| | 0/4 [00:00<?, ?it/s]
25%|██▌ | 1/4 [11:27<34:21, 687.08s/it]
50%|█████ | 2/4 [11:41<09:42, 291.39s/it]
75%|███████▌ | 3/4 [11:49<02:42, 162.13s/it]
100%|██████████| 4/4 [12:28<00:00, 187.20s/it]
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
print("MWM solution was computed in %d minutes" % int((end - start)/60)) print("MWM solution was computed in %d minutes" % int((end - start)/60))
``` ```
%% Output
MWM solution was computed in 12 minutes
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# Section 6: Constructing Ancestral Contigs # Section 6: Constructing Ancestral Contigs
In order to succesfully construct ancestral contigs, we require: In order to succesfully construct ancestral contigs, we require:
- Successful Maximum Weight Matching on the prepared input data - Successful Maximum Weight Matching on the prepared input data
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Creating Contigs ## Creating Contigs
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
Produces the contigs for each ancestor. Produces the contigs for each ancestor.
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# creating contigs from the maximum weight matching output # creating contigs from the maximum weight matching output
contig_template = parsed_config['output_file']['contig_template'] contig_template = parsed_config['output_file']['contig_template']
# string to use for next step # string to use for next step
dcj_files = [] dcj_files = []
for i in tqdm(range(1, num_anc), total = len(mwm_inputs)): for i in tqdm(range(1, num_anc), total = len(mwm_inputs)):
outfile_mwmout = directory + mwm_output_template + str(i) + ".txt" outfile_mwmout = directory + mwm_output_template + str(i) + ".txt"
outfile_contig = directory + contig_template + str(i) + ".txt" outfile_contig = directory + contig_template + str(i) + ".txt"
dcj_files.append(outfile_contig) dcj_files.append(outfile_contig)
contig = Contig(outfile_mwmout, int(num_gene_families)) contig = Contig(outfile_mwmout, int(num_gene_families))
contig.get_edge() contig.get_edge()
list_telomeres = contig.find_telomeres() list_telomeres = contig.find_telomeres()
contig_list = contig.get_contigs(list_telomeres, outfile_contig, str(i)) contig_list = contig.get_contigs(list_telomeres, outfile_contig, str(i))
``` ```
%% Output
0%| | 0/4 [00:00<?, ?it/s]
50%|█████ | 2/4 [00:00<00:00, 13.52it/s]
100%|██████████| 4/4 [00:00<00:00, 14.02it/s]
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# Section 7: Calculating DCJ Distance Between Ancestors # Section 7: Calculating DCJ Distance Between Ancestors
In order to succesfully calculate the DCJ Distance between ancestors, we require: In order to succesfully calculate the DCJ Distance between ancestors, we require:
- Successful creation of ancestral contigs - Successful creation of ancestral contigs
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Calculating DCJ Distance ## Calculating DCJ Distance
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
jar_path = parsed_config['input_file']['jar_path'] jar_path = parsed_config['input_file']['jar_path']
dcj_output = [] dcj_output = []
# Calculate DCJ Distance # Calculate DCJ Distance
print("Starting DCJ Calculations, May Take a While") print("Starting DCJ Calculations, May Take a While")
for i in tqdm(range(1, num_anc)): for i in tqdm(range(1, num_anc)):
for j in range(i + 1, num_anc): for j in range(i + 1, num_anc):
dcj_output_path = directory + parsed_config['output_file']['dcj_output_path'] + str(i) + "_" + str(j) + ".txt" dcj_output_path = directory + parsed_config['output_file']['dcj_output_path'] + str(i) + "_" + str(j) + ".txt"
command = "java -jar " + jar_path + "/UniMoG-java11.jar " + str(dcj_files[i - 1]) + " " + str(dcj_files[j - 1]) + " -m=1 -p >" + dcj_output_path command = "java -jar " + jar_path + "/UniMoG-java11.jar " + str(dcj_files[i - 1]) + " " + str(dcj_files[j - 1]) + " -m=1 -p >" + dcj_output_path
dcj_output.append(dcj_output_path) dcj_output.append(dcj_output_path)
print(command) print(command)
print("Computing DCJ distance between ancestors", i, "and", j) print("Computing DCJ distance between ancestors", i, "and", j)
os.system(command) os.system(command)
print("Calculation Done") print("Calculation Done")
``` ```
%% Output
0%| | 0/4 [00:00<?, ?it/s]
Starting DCJ Calculations, May Take a While
java -jar ./UniMoG-java11.jar ../outputData/project-monocots/contigAnc1.txt ../outputData/project-monocots/contigAnc2.txt -m=1 -p >../outputData/project-monocots/dcj_output1_2.txt
Computing DCJ distance between ancestors 1 and 2
java -jar ./UniMoG-java11.jar ../outputData/project-monocots/contigAnc1.txt ../outputData/project-monocots/contigAnc3.txt -m=1 -p >../outputData/project-monocots/dcj_output1_3.txt
Computing DCJ distance between ancestors 1 and 3
java -jar ./UniMoG-java11.jar ../outputData/project-monocots/contigAnc1.txt ../outputData/project-monocots/contigAnc4.txt -m=1 -p >../outputData/project-monocots/dcj_output1_4.txt
Computing DCJ distance between ancestors 1 and 4
25%|██▌ | 1/4 [00:22<01:08, 22.83s/it]
java -jar ./UniMoG-java11.jar ../outputData/project-monocots/contigAnc2.txt ../outputData/project-monocots/contigAnc3.txt -m=1 -p >../outputData/project-monocots/dcj_output2_3.txt
Computing DCJ distance between ancestors 2 and 3
java -jar ./UniMoG-java11.jar ../outputData/project-monocots/contigAnc2.txt ../outputData/project-monocots/contigAnc4.txt -m=1 -p >../outputData/project-monocots/dcj_output2_4.txt
Computing DCJ distance between ancestors 2 and 4
50%|█████ | 2/4 [00:37<00:36, 18.02s/it]
java -jar ./UniMoG-java11.jar ../outputData/project-monocots/contigAnc3.txt ../outputData/project-monocots/contigAnc4.txt -m=1 -p >../outputData/project-monocots/dcj_output3_4.txt
Computing DCJ distance between ancestors 3 and 4
100%|██████████| 4/4 [00:44<00:00, 11.24s/it]
Calculation Done
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
dcj_summary = directory + parsed_config['output_file']['dcj_summary_path'] dcj_summary = directory + parsed_config['output_file']['dcj_summary_path']
dcj_matrix = [] dcj_matrix = []
with open(dcj_summary, 'w') as dcj_summary_file: with open(dcj_summary, 'w') as dcj_summary_file:
for i in range(len(median_structure)): for i in range(len(median_structure)):
dcj_summary_file.write("Median Structure for Ancestor " + str((i + 1)) + ": " + "%s" % median_structure[i] + "\n") dcj_summary_file.write("Median Structure for Ancestor " + str((i + 1)) + ": " + "%s" % median_structure[i] + "\n")
for file in dcj_output: for file in dcj_output:
path = file path = file
with open(path, 'r') as dcj_file: with open(path, 'r') as dcj_file:
dcj_info = dcj_file.readlines() dcj_info = dcj_file.readlines()
dcj_summary_file.write(dcj_info[0]) dcj_summary_file.write(dcj_info[0])
print(dcj_info[0]) print(dcj_info[0])
distance = int((dcj_info[0].split())[-1]) distance = int((dcj_info[0].split())[-1])
dcj_matrix.append(distance) dcj_matrix.append(distance)
print("Summay of pairwise DCJ distance -- Done") print("Summay of pairwise DCJ distance -- Done")
``` ```
%% Output
> "Ancestor 1" & "Ancestor 2" (DCJ) : 2572
> "Ancestor 1" & "Ancestor 3" (DCJ) : 3146
> "Ancestor 1" & "Ancestor 4" (DCJ) : 3505
> "Ancestor 2" & "Ancestor 3" (DCJ) : 2478
> "Ancestor 2" & "Ancestor 4" (DCJ) : 3128
> "Ancestor 3" & "Ancestor 4" (DCJ) : 2483
Summay of pairwise DCJ distance -- Done
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
matrix_indices = [("Ancestor " + str(i)) for i in range (1, num_anc)] matrix_indices = [("Ancestor " + str(i)) for i in range (1, num_anc)]
dcj_matrix_summary = pd.DataFrame(squareform(dcj_matrix), index=matrix_indices, columns=matrix_indices) dcj_matrix_summary = pd.DataFrame(squareform(dcj_matrix), index=matrix_indices, columns=matrix_indices)
dcj_matrix_summary.style.set_caption("DCJ Distance Summary Symmetric Matrix") dcj_matrix_summary.style.set_caption("DCJ Distance Summary Symmetric Matrix")
dcj_matrix_summary dcj_matrix_summary
``` ```
%% Output
Ancestor 1 Ancestor 2 Ancestor 3 Ancestor 4
Ancestor 1 0 2572 3146 3505
Ancestor 2 2572 0 2478 3128
Ancestor 3 3146 2478 0 2483
Ancestor 4 3505 3128 2483 0
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
``` ```
......