Compare revisions

7da209fc · 7da209fc · 7da209fc · 7da209fc · 7da209fc · 7da209fc
--- a/module1_2/inputData/project-legumes8/karyotype/karyotype_64734_Quillaja.txt
+++ b/module1_2/inputData/project-legumes8/karyotype/karyotype_64734_Quillaja.txt
 chr	size
-1	19561049
-2	20977047
-3	37318896
-4	34047100
-5	21375137
-6	28584509
-7	26440503
-8	28358298
-9	26458845
-10	21706344
-11	21697428
-12	21470783
-13	18106809
-14	20802709
\ No newline at end of file
+quisa.S10.gnm1.Chr01	19561049
+quisa.S10.gnm1.Chr02	20977047
+quisa.S10.gnm1.Chr03	37318896
+quisa.S10.gnm1.Chr04	34047100
+quisa.S10.gnm1.Chr05	21375137
+quisa.S10.gnm1.Chr06	28584509
+quisa.S10.gnm1.Chr07	26440503
+quisa.S10.gnm1.Chr08	28358298
+quisa.S10.gnm1.Chr09	26458845
+quisa.S10.gnm1.Chr10	21706344
+quisa.S10.gnm1.Chr11	21697428
+quisa.S10.gnm1.Chr12	21470783
+quisa.S10.gnm1.Chr13	18106809
+quisa.S10.gnm1.Chr14	20802709
\ No newline at end of file
--- a/module1_2/inputData/project-legumes8/karyotype/karyotype_64742_Cercis.txt
+++ b/module1_2/inputData/project-legumes8/karyotype/karyotype_64742_Cercis.txt
 chr	size
-1	57468748
-2	51272464
-3	49996358
-4	48286772
-5	45669363
-6	45655421
-7	41929180
\ No newline at end of file
+cerca.ISC453364.gnm3.Chr01	57468748
+cerca.ISC453364.gnm3.Chr02	51272464
+cerca.ISC453364.gnm3.Chr03	49996358
+cerca.ISC453364.gnm3.Chr04	48286772
+cerca.ISC453364.gnm3.Chr05	45669363
+cerca.ISC453364.gnm3.Chr06	45655421
+cerca.ISC453364.gnm3.Chr07	41929180
\ No newline at end of file
--- a/module1_2/inputData/project-legumes8/karyotype/karyotype_64767_Sindora.txt
+++ b/module1_2/inputData/project-legumes8/karyotype/karyotype_64767_Sindora.txt
 chr	size
-1	136071090
-2	116487353
-3	109186524
-4	91914064
-5	88658134
-6	84872623
-7	83373435
-8	81990953
-9	76993406
-10	75912673
-11	75144154
-12	71117942
\ No newline at end of file
+singl.CAF01.gnm1.Chr01	136071090
+singl.CAF01.gnm1.Chr02	116487353
+singl.CAF01.gnm1.Chr03	109186524
+singl.CAF01.gnm1.Chr04	91914064
+singl.CAF01.gnm1.Chr05	88658134
+singl.CAF01.gnm1.Chr06	84872623
+singl.CAF01.gnm1.Chr07	83373435
+singl.CAF01.gnm1.Chr08	81990953
+singl.CAF01.gnm1.Chr09	76993406
+singl.CAF01.gnm1.Chr10	75912673
+singl.CAF01.gnm1.Chr11	75144154
+singl.CAF01.gnm1.Chr12	71117942
\ No newline at end of file
--- a/module1_2/inputData/project-legumes8/karyotype/karyotype_64768_Senna.txt
+++ b/module1_2/inputData/project-legumes8/karyotype/karyotype_64768_Senna.txt
 chr	size
-1	32816166
-2	42009719
-3	37860065
-4	30689712
-5	52777034
-6	46512068
-7	30975534
-8	49705205
-9	35860388
-10	41499577
-11	30871617
-12	29799933
-13	41270226
\ No newline at end of file
+sento.Myeongyun.gnm1.Chr01	32816166
+sento.Myeongyun.gnm1.Chr02	42009719
+sento.Myeongyun.gnm1.Chr03	37860065
+sento.Myeongyun.gnm1.Chr04	30689712
+sento.Myeongyun.gnm1.Chr05	52777034
+sento.Myeongyun.gnm1.Chr06	46512068
+sento.Myeongyun.gnm1.Chr07	30975534
+sento.Myeongyun.gnm1.Chr08	49705205
+sento.Myeongyun.gnm1.Chr09	35860388
+sento.Myeongyun.gnm1.Chr10	41499577
+sento.Myeongyun.gnm1.Chr11	30871617
+sento.Myeongyun.gnm1.Chr12	29799933
+sento.Myeongyun.gnm1.Chr13	41270226
\ No newline at end of file
--- a/module1_2/inputData/project-legumes8/karyotype/karyotype_64962_Prunus.txt
+++ b/module1_2/inputData/project-legumes8/karyotype/karyotype_64962_Prunus.txt
 chr	size
-1	47851208
-2	30405870
-3	27368013
-4	25843236
-5	18496696
-6	30767194
-7	22388614
-8	22573980
\ No newline at end of file
+prupe.Lovell.gnm2.Pp01	47851208
+prupe.Lovell.gnm2.Pp02	30405870
+prupe.Lovell.gnm2.Pp03	27368013
+prupe.Lovell.gnm2.Pp04	25843236
+prupe.Lovell.gnm2.Pp05	18496696
+prupe.Lovell.gnm2.Pp06	30767194
+prupe.Lovell.gnm2.Pp07	22388614
+prupe.Lovell.gnm2.Pp08	22573980
\ No newline at end of file
--- a/module1_2/inputData/project-legumes8/karyotype/karyotype_64963_Phaseolus.txt
+++ b/module1_2/inputData/project-legumes8/karyotype/karyotype_64963_Phaseolus.txt
 chr	size
-1	51433939
-2	49670989
-3	53438756
-4	48048378
-5	40923498
-6	31236378
-7	40041001
-8	63048260
-9	38250102
-10	44302882
-11	53580169
\ No newline at end of file
+phavu.G19833.gnm2.Chr01	51433939
+phavu.G19833.gnm2.Chr02	49670989
+phavu.G19833.gnm2.Chr03	53438756
+phavu.G19833.gnm2.Chr04	48048378
+phavu.G19833.gnm2.Chr05	40923498
+phavu.G19833.gnm2.Chr06	31236378
+phavu.G19833.gnm2.Chr07	40041001
+phavu.G19833.gnm2.Chr08	63048260
+phavu.G19833.gnm2.Chr09	38250102
+phavu.G19833.gnm2.Chr10	44302882
+phavu.G19833.gnm2.Chr11	53580169
\ No newline at end of file
--- a/module1_2/inputData/project-legumes8/karyotype/karyotype_64964_Medicago.txt
+++ b/module1_2/inputData/project-legumes8/karyotype/karyotype_64964_Medicago.txt
 chr	size
-1	52991155
-2	45729672
-3	55515152
-4	56582383
-5	43630510
-6	35275713
-7	49172423
-8	45569985
\ No newline at end of file
+medtr.A17_HM341.gnm4.chr1	52991155
+medtr.A17_HM341.gnm4.chr2	45729672
+medtr.A17_HM341.gnm4.chr3	55515152
+medtr.A17_HM341.gnm4.chr4	56582383
+medtr.A17_HM341.gnm4.chr5	43630510
+medtr.A17_HM341.gnm4.chr6	35275713
+medtr.A17_HM341.gnm4.chr7	49172423
+medtr.A17_HM341.gnm4.chr8	45569985
\ No newline at end of file
--- a/module1_2/inputData/project-legumes8/karyotype/karyotype_64965_Vitis.txt
+++ b/module1_2/inputData/project-legumes8/karyotype/karyotype_64965_Vitis.txt
 chr	size
-1	23037639
-2	18779844
-3	19341862
-4	23867706
-5	25021643
-6	21508407
-7	21026613
-8	22385789
-9	23006712
-10	18140952
-11	19818926
-12	22702307
-13	24396255
-14	30274277
-15	20304914
-16	22053297
-17	17126926
-18	29360087
-19	24021853
\ No newline at end of file
+vitvi.PN40024.gnm2.chr1	23037639
+vitvi.PN40024.gnm2.chr2	18779844
+vitvi.PN40024.gnm2.chr3	19341862
+vitvi.PN40024.gnm2.chr4	23867706
+vitvi.PN40024.gnm2.chr5	25021643
+vitvi.PN40024.gnm2.chr6	21508407
+vitvi.PN40024.gnm2.chr7	21026613
+vitvi.PN40024.gnm2.chr8	22385789
+vitvi.PN40024.gnm2.chr9	23006712
+vitvi.PN40024.gnm2.chr10	18140952
+vitvi.PN40024.gnm2.chr11	19818926
+vitvi.PN40024.gnm2.chr12	22702307
+vitvi.PN40024.gnm2.chr13	24396255
+vitvi.PN40024.gnm2.chr14	30274277
+vitvi.PN40024.gnm2.chr15	20304914
+vitvi.PN40024.gnm2.chr16	22053297
+vitvi.PN40024.gnm2.chr17	17126926
+vitvi.PN40024.gnm2.chr18	29360087
+vitvi.PN40024.gnm2.chr19	24021853
\ No newline at end of file
--- a/module1_2/inputData/project-monocots/config.yaml
+++ b/module1_2/inputData/project-monocots/config.yaml
 # project name
-project_name: "project-monocots"
+project_name: "project-buxus"

 # input : path to genome information file
 input_file:
@@ -52,21 +52,21 @@ gf3: 1
 K: 7

 ## set thresholds
-ctgLen: 1   ## minimum length in each contig
-nctg: 250    ## only include first nctg contigs
+#ctgLen: 1   ## minimum length in each contig
+#nctg: 250    ## only include first nctg contigs

 ## threshold of distance (1MB) between two genes in the same contig on the same chromosome
 ## if the distance between two genes is smaller than the threshold, merge them into one block
-DIS.threshold: 1000000
+#DIS.threshold: 1000000

 ## threshold to visualize the gene/block (150 kb)
 ## if a block is less than this length, do not show it --> keep the plot clean
-blockLEN.threshold: 150000
+#blockLEN.threshold: 150000


 ## threshold for coocurrence analysis
 ## only count blocks that are longer than 15KB
-lenBLK.threshold: 15000
+#lenBLK.threshold: 15000




--- a/module1_2/inputData/project-monocots/karyotype/karyotype_25734_Ananas.txt
+++ b/module1_2/inputData/project-monocots/karyotype/karyotype_25734_Ananas.txt
+chr	size
+LG01	24880688
+LG02	17334668
+LG03	16781886
+LG04	15584765
+LG05	15024279
+LG06	14747297
+LG07	14728688
+LG08	13970067
+LG09	13841557
+LG10	13112115
+LG11	13096665
+LG12	12612916
+LG13	11759267
+LG14	11705491
+LG15	11365478
+LG16	11174208
+LG17	11156084
+LG18	10911977
+LG19	10815949
+LG20	10792357
+LG21	10645290
+LG22	10398566
+LG23	7979763
+LG24	7684463
+LG25	3739203
--- a/module1_2/inputData/project-monocots/karyotype/karyotype_33018_Elaeis.txt
+++ b/module1_2/inputData/project-monocots/karyotype/karyotype_33018_Elaeis.txt
+chr	size
+REF_ELAGV01 	68435666
+REF_ELAGV02 	65558141
+REF_ELAGV03 	60060032
+REF_ELAGV04 	57251647
+REF_ELAGV05 	51955539
+REF_ELAGV06 	44357769
+REF_ELAGV07 	43454766
+REF_ELAGV08 	40195399
+REF_ELAGV09 	38056896
+REF_ELAGV10 	31890735
+REF_ELAGV11	30068910
+REF_ELAGV12	28800575
+REF_ELAGV13	27817470
+REF_ELAGV14	24379743
+REF_ELAGV15	24314465
+REF_ELAGV16	21371083
--- a/module1_2/inputData/project-monocots/karyotype/karyotype_33908_Asparagus.txt
+++ b/module1_2/inputData/project-monocots/karyotype/karyotype_33908_Asparagus.txt
+chr	size
+AsparagusV1_01	132393784
+AsparagusV1_02	84732013
+AsparagusV1_03	113259003
+AsparagusV1_04	146560859
+AsparagusV1_05	131469567
+AsparagusV1_06	78779235
+AsparagusV1_07	151357931
+AsparagusV1_08	131339754
+AsparagusV1_09	68543460
+AsparagusV1_10	73452336
--- a/module1_2/inputData/project-monocots/karyotype/karyotype_51051_Dioscorea.txt
+++ b/module1_2/inputData/project-monocots/karyotype/karyotype_51051_Dioscorea.txt
+chr	size
+1	31431123
+2	33969142
+3	19547260
+4	27749968
+5	32871075
+6	32866201
+7	18053115
+8	28021425
+9	23324680
+10	17904088
+11	17195437
+12	25272979
+13	29719729
+14	15713101
+15	10474557
+16	23371965
+17	20585887
+18	22795539
+19	9927364
+20	10677198
+21	5203141
--- a/module1_2/inputData/project-monocots/karyotype/karyotype_51364_Spirodela.txt
+++ b/module1_2/inputData/project-monocots/karyotype/karyotype_51364_Spirodela.txt
+chr	size
+1	11560055
+2	8180930
+3	8870115
+4	9537711
+5	7072156
+6	8333076
+7	7949387
+8	8482602
+9	7754086
+10	7963964
+11	6749155	
+12	5993893
+13	6317482
+14	5575328
+15	4557367
+16	5820572
+17	4601490
+18	5315945
+19	3959434
+20	3997407
--- a/module1_2/inputData/project-monocots/karyotype/karyotype_54711_Acorus.txt
+++ b/module1_2/inputData/project-monocots/karyotype/karyotype_54711_Acorus.txt
+chr	size
1	37743429
2	34360390
3	33772675
4	31994915
5	31167455
6	30970160
7	30771956
8	28400560
9	28235908
10	25518903
11	25169648
12	24790462
\ No newline at end of file
--- a/module1_2/src/Genome.py
+++ b/module1_2/src/Genome.py
@@ -81,7 +81,7 @@ class Genome:
                    f.write(chromosome + " ")

                    for gene in genes:
-                        f.write("%s " % genes)
+                        f.write("%s " % gene)

                    f.write("\n")


--- a/module1_2/src/Module1_2.ipynb
+++ b/module1_2/src/Module1_2.ipynb
 %% Cell type:markdown id: tags:

 # RACCROCHE - Module 1 & 2
 This is a full workflow that shows methods regarding the construction of gene families, listing of candidate adjacencies and construction of ancestral contigs using Maximum Weight Matching.

 %% Cell type:markdown id: tags:

 # Section 1: Importing Libraries, Modules and Configuration File

 %% Cell type:code id: tags:

 ``` python
 # setting the Notebook Display Method
 %matplotlib inline
 ```

 %% Cell type:code id: tags:

 ``` python
 # install tqdm-joblib
 %pip install tqdm-joblib

 # install Bio
 %pip install Bio
 ```

 %% Cell type:code id: tags:

 ``` python
 # import libraries
 import sys
 import yaml
 import time
 import os
 import io

 import pandas as pd
 import seaborn as sns
 import matplotlib
 import matplotlib.pyplot as plt
 from Bio import Phylo
 from scipy.spatial.distance import squareform

 from tqdm import tqdm
 import tqdm_joblib
 from joblib import Parallel, delayed

 # import modules
 #? eventually the below list will be in your package
 from GeneFamily import GeneFamily
 from Genome import Genome
 from MWMInputTreeNode import MWMInputTreeNode
 from mwmatching import *
 from Contig import *
 ```

 %% Cell type:markdown id: tags:

 ## Parsing the Configuration File

 The configuration file consists of a list of input/output directories, input files and parameters to the pipeline.

 Specifications for Input:
 - leaf_genome_info: the input genomes with their positions on the phylogenetic tree, i.e. *Genomes.txt* <br>
 - synteny_file_name: the post-fix of the synmap output of pairwise genome comparisons; <br>
 - synteny_file_path: the path to the synmap output data <br>
 - phylo_tree_path: the path to the Newick Phylogenetic Tree structure <br>
 - jar_path: the path to the UniMoG jar file <br>

 __Note__: The input genomes is in the *Genomes.txt* file.
 It contains the information about input genomes, including CoGe ID, genomeName, ancestor<br>
 The phylogenetic relationship is in the *phyloTree* file.
 It includes the unrooted phylogenetic tree in newick format where each tree leaf is denoted as name_CoGeID. <br>
 Sample Newick Tree for the monocot project: ( ( (Spirodela_51364, Acorus_54711), Dioscorea_51051), ( (Ananas_25734, Elaeis_33018), Asparagus_33908) )

 Specifications for Output:
 - gene_list: path where the gene list output should be created <br>
 - gene_family: path where the gene family output should be created <br>
 - genomes: path where the genome string output should be created <br>
 - mwm_input_template: path and template for the output files created from MWM Input step <br>
 - mwm_output_template: path and template for the output files created from MWM <br>
 - contig_template: path and template for the output files created from constructing contigs <br>
 - dcj_output_path: path to the output file when calculating DCJ distance between ancestors <br>
 - dcj_summary_path: path to the output file containing a summary of the DCJ calculations <br><br>

 Global parameters:
 - min_cutoff_weight: minimum similarity cutoff weight for gene family construction, i.e. *min_cutoff_weight=65*
 - max_cutoff_weight: maximum similarity cutoff weight for gene family construction, i.e. *max_cutoff_weight=100*
 - ws: window size, i.e. *ws=7*
 - min_mwm_weight: minimum adjacency weight to be considered for maximum weight matching, i.e. *min_mwm_weight=100*
 - gf1: maximum number of genes in a gene family, i.e. *gf1=50*
 - gf2: the maximum number of genes from a genome in a gene family, i.e. *gf2=10*
 - gf3: the minimum number of genomes in a gene family, i.e. *gf3=1*

 %% Cell type:code id: tags:

 ``` python
 # specify your Project Name

-proj_name = "monocots"
+proj_name = "buxus"

 # reading from the configuration file
 config_file = "../inputData/project-" + proj_name + "/config.yaml"

 with open(config_file, 'r') as stream:
    parsed_config = yaml.load(stream, Loader=yaml.FullLoader)

 directory = parsed_config['output_path'] + parsed_config['project_name']
 os.makedirs(directory, exist_ok = True)

 print("Project Name and Input Directory:", parsed_config['input_path'] + parsed_config['project_name'])
 print("Project Name and Output Directory:", parsed_config['output_path'] + parsed_config['project_name'])
 print('''Please check required input data under the Input Directory:
 \t 1. Genome.txt with info about input genomes
 \t 2. phyloTree.txt with Newick Tree Structure
 \t 3. SynMap results between every pair of genomes
 ''')
 #print("Project input data directory:", parsed_config['input_file']['synteny_file_path'])
 #print("Input extant genomes info:", parsed_config['input_file']['leaf_genome_info'])
 #print("The postfix of SynMap output files:", parsed_config['input_file']['synteny_file_name'])
 ```

 %% Output

-    Project Name and Input Directory: ../inputData/project-monocots
-    Project Name and Output Directory: ../outputData/project-monocots
+    Project Name and Input Directory: ../inputData/project-buxus
+    Project Name and Output Directory: ../outputData/project-buxus
    Please check required input data under the Input Directory:
    	 1. Genome.txt with info about input genomes
    	 2. phyloTree.txt with Newick Tree Structure
    	 3. SynMap results between every pair of genomes
    

 %% Cell type:markdown id: tags:

 # Section 2: Constructing Gene Families from syntenically validated orthologs from SynMap
 In order to succesfully construct gene families, we require:
 - Successful parsing of the configuration file <br>
 - Valid parameters in the configuration file

 %% Cell type:markdown id: tags:

 ## Reading in Input Genome Data and Tree Structure Data
 Each genome is a leaf of the input phylogenetic tree. The *get_leaves_and_tree_info()* method extracts the following attribute of each genome from the input file specified by *leaf_genome_info* parameter in the configuration file.
 >1) genome ID; <br>
 >2) genome name; <br>
 >3) most recent ancestor; <br>
 >4) number of chromosomes; <br>
 >5) file name of the genome annotations; <br>
 >6) desired tree structure in Newick format.

 %% Cell type:code id: tags:

 ``` python
 # reading in input genome info
 gene_family = GeneFamily(parsed_config)
 all_leaves, median_structure, newick_structure = gene_family.get_leaves_and_tree_info()

 print("Extant genomes to be analyzed in this project: \n")

 genomes = pd.DataFrame(all_leaves)
 display = genomes.iloc[:, :3]

 display.columns = ['Genome ID', 'Genome Name', 'Ancestor']
 display.set_index('Genome ID', inplace = True, drop = True)

 display
 ```

 %% Output

    Confirm ancestors:
-    ['51364', '54711', '25734,33018,33908,51051']
-    ['51364,54711', '51051', '25734,33018,33908']
-    ['51364,54711,51051', '33908', '25734,33018']
-    ['25734', '33018', '33908,51051,51364,54711']
+    ['19990', '54057', '57266,57268,28620,35405']
+    ['57266', '57268', '54057,19990,28620,35405']
+    ['19990,54057', '57266,57268', '28620,35405']
+    ['28620', '35405', '54057,19990,57266,57268']
    
    Extant genomes to be analyzed in this project:
    

-              Genome Name Ancestor
+                Genome Name Ancestor
    Genome ID
-    25734          Ananas        4
-    33018          Elaeis        4
-    33908       Asparagus        3
-    51051       Dioscorea        2
-    51364       Spirodela        1
-    54711          Acorus        1
+    19990             Vitis        1
+    28620         Aquilegea        4
+    35405           Nelumbo        4
+    54057        Amaranthus        1
+    57266      Tetracentron        2
+    57268             Buxus        2

 %% Cell type:code id: tags:

 ``` python
 tree = Phylo.read(io.StringIO(newick_structure), "newick")

 matplotlib.rc('font', size=8)
 fig = plt.figure(figsize=(8, 5), dpi=100)
 axes = fig.add_subplot(1, 1, 1)
 axes.set_facecolor('white')

 Phylo.draw(tree, axes = axes)
 ```

 %% Output

-
+

 %% Cell type:markdown id: tags:

 ## Creating gene families from syntenically validated homology generated from SynMap

 Read in all genes for every genome, record gene-related information and use gf1 (from config file), cutoff weights in order to create gene families.

 %% Cell type:code id: tags:

 ``` python
 print(all_leaves)
 # creating gene families
 gene_family.make_gene_family(all_leaves)

 print(len(all_leaves))
 ```

 %% Output

    [['25734', 'Ananas', '4', '25'], ['33018', 'Elaeis', '4', '16'], ['33908', 'Asparagus', '3', '10'], ['51051', 'Dioscorea', '2', '21'], ['51364', 'Spirodela', '1', '20'], ['54711', 'Acorus', '1', '12']]
    6

 %% Cell type:markdown id: tags:

 ## Visualizing Gene Families
 In the configuration file *config.yaml*, there are three parameters to restrict the size of gene families:
 - gf1: the maximum number of genes in a gene family
 - gf2: the maximum number of genes from a single genome in a gene family
 - gf3: the minimum number of genomes in a gene family <br>

 %% Cell type:code id: tags:

 ``` python
 families = list(gene_family.gene_family.keys())

 values = list(gene_family.gene_family.values())
 lengths = [len(values[i]) for i in range(0, len(values))]

 print("Total number of gene familes:", len(families))
 print("The size of the largest gene family:", max(lengths))
 ```

 %% Output

-    Total number of gene familes: 8790
+    Total number of gene familes: 10236
    The size of the largest gene family: 49

 %% Cell type:code id: tags:

 ``` python
 sns.set(style="darkgrid")
 family_dict = {'label': families, 'lengths': lengths}
 pdfamily = pd.DataFrame(family_dict)

 fig=sns.histplot(data = pdfamily, x = "lengths", binwidth = 3).set(title = "Overview of Gene Families")

 plt.xlabel("Number of genes in a gene family")
 plt.ylabel("Frequency")
 plt.show(fig)
 ```

 %% Output

-
+

 %% Cell type:markdown id: tags:

 # Section 3: Representing Genomes by Gene Family Labels
 In order to succesfully separate the input data (genes) into respective chromosomes and genomes, we require:
 - Successful creation of gene families

 %% Cell type:markdown id: tags:

 ## Grouping Genes and their ordering by Chromosomes then Chromosomes by Genomes

 %% Cell type:code id: tags:

 ``` python
 # creating an instance of the Genome class
 initialize_genome = Genome(gene_family.gene_list, all_leaves, parsed_config)
 ```

 %% Cell type:code id: tags:

 ``` python
 # separating each genome's genes into chromosomes
 genome = initialize_genome.get_genome_in_string()
 ```

 %% Cell type:markdown id: tags:

 ## Visualizing Genes Families in Chromosomes of extant genomes

 %% Cell type:code id: tags:

 ``` python
 genomes, chromosomes, gene_lengths = [], [], []
 genome_lengths = []
 num_chr = []

 for key, value in genome.items():
    current_lengths = 0

    for genome_id in all_leaves:
        if int(genome_id[0]) == int(key):
            genome_name = genome_id[1]
            num_chromosomes = int(genome_id[3])
            num_chr.append(int(num_chromosomes))

    for i, chromosome in enumerate(value):
        genomes.append(genome_name)
        gene_lengths.append(len(chromosome))
        current_lengths += 1

    chromosomes.extend(range(1, num_chromosomes + 1))

    genome_lengths.append(current_lengths)

 #print(chromosomes)
 ```

 %% Cell type:code id: tags:

 ``` python
 length_index = 0

 # removing short scaffolds for visualization
 for i, lengths in enumerate(genome_lengths):
    left = length_index
    right = length_index + genome_lengths[i]

    gene_lengths[left:right] = sorted(gene_lengths[left:right], reverse = True)
    del gene_lengths[left + num_chr[i] : right]
    del genomes[left + num_chr[i] : right]

    length_index += num_chr[i]
 ```

 %% Cell type:code id: tags:

 ``` python
 print("Visualizing the number of gene families in each chromosome of each genome")

 data = {'genome': genomes, 'chromosome number': chromosomes, 'number of gene families': gene_lengths}

 pddata = pd.DataFrame(data)

 sns.set(style="darkgrid")

 g = sns.catplot(x = 'chromosome number', y = 'number of gene families', col = 'genome', data = pddata, kind="bar", aspect = .7, sharex = False, dodge = False)
 g.set_xticklabels(rotation = 30)
 ```

 %% Output

    Visualizing the number of gene families in each chromosome of each genome

    /Users/lij313/opt/anaconda3/lib/python3.8/site-packages/seaborn/categorical.py:3803: UserWarning: Setting `sharex=False` with `color=None` may cause different levels of the `x` variable to share colors. This will change in a future version.
      warnings.warn(msg.format("sharex", "x"), UserWarning)

-    <seaborn.axisgrid.FacetGrid at 0x7fc296913190>
+    <seaborn.axisgrid.FacetGrid at 0x7fd3b1ac57c0>



 %% Cell type:markdown id: tags:

 # Section 4: Extract gene proximity (or adjacencies) from each extant genome

 Identify generalized gene adjacencies with a user specified window/gap size. The size parameter (*ws*) is defined in the configuration file.
 In order to succesfully prepare adjacency data for the Maximum Weight Matching algorithm, we require:
 - Successful separation of input data

 %% Cell type:markdown id: tags:

 ## Collecting Gene Adjacencies for Maximum Weight Matching

 Adjacencies are extracted from gene orders in each extant genome. <br>
 A large number of (generalized) adjacencies is collected for each ancestor defined in *median_structure*.


 Number of edges are the same since the number of genes and window size is consistent for each ancestor. However, the weights will be different.

 %% Cell type:code id: tags:

 ``` python
 ### manually change the median structure to construct monoploid extant genome
 #median_structure=[['61837', '61837', '61837']]
 ```

 %% Cell type:code id: tags:

 ``` python
 # preparing data for maximum weight matching + maximum weight matching
 window_size = parsed_config['ws']
 min_adj_weight = parsed_config['min_mwm_weight']
 num_gene_families = len(gene_family.gene_family)

 input_tree_node = MWMInputTreeNode(genome, all_leaves)

 mwmin_output_template = parsed_config['output_file']['mwm_input_template']

 num_anc = 1
 mwm_inputs = [None] * len(median_structure)

 for i, structure in tqdm(enumerate(median_structure), total=len(mwm_inputs)):
    outfile_mwmin = directory + mwmin_output_template + str(num_anc) + ".txt"
    mwm_inputs[num_anc - 1] = input_tree_node.get_mwm_input(structure, num_gene_families, window_size, min_adj_weight, outfile_mwmin)
    num_anc += 1
 ```

 %% Output

-    100%|██████████| 4/4 [00:02<00:00,  1.53it/s]
+    100%|██████████| 4/4 [00:02<00:00,  1.50it/s]

 %% Cell type:markdown id: tags:

 # Section 5: Maximum Weight Matching
 In order to successfully complete the Maximum Weight Matching algorithm, we require:
 - Successful preparation of Maximum Weight Matching Input Data

 %% Cell type:markdown id: tags:

 ## Maximum Weight Matching

 %% Cell type:markdown id: tags:

 Produces the result of Maximum Weight Matching for each ancestor. <br>
 This step is computationally expensive. <br>
 Uses multiprocessing to speed up the process.

 %% Cell type:code id: tags:

 ``` python
 # The path of output file in general form
 mwm_output_template = parsed_config['output_file']['mwm_output_template']

 def mwm_matching(mwm_input, num_anc):
    outfile_mwmout = directory + mwm_output_template + str(num_anc) + ".txt"
    maxWeightMatching(mwm_input, outfile_mwmout)
 ```

 %% Cell type:code id: tags:

 ``` python
 # this step is slow
 # inputs for multiprocessing
 numbers = [i for i in range(1, num_anc)]
 inputs = zip(mwm_inputs, numbers)

 start = time.time()

 # multiprocessing to make workflow faster
 with tqdm_joblib.tqdm_joblib(tqdm(desc = "Maximum Weight Matching", total = len(mwm_inputs))) as progress_bar:
    Parallel(n_jobs=(num_anc - 1))(delayed(mwm_matching)(mwm_inputs[i], numbers[i]) for i in range (0, len(mwm_inputs)))

 end = time.time()
 ```

 %% Output

    Maximum Weight Matching:   0%|          | 0/4 [00:00<?, ?it/s]
      0%|          | 0/4 [00:00<?, ?it/s][A
+     25%|██▌       | 1/4 [11:27<34:21, 687.08s/it][A
+     50%|█████     | 2/4 [11:41<09:42, 291.39s/it][A
+     75%|███████▌  | 3/4 [11:49<02:42, 162.13s/it][A
+    100%|██████████| 4/4 [12:28<00:00, 187.20s/it][A

 %% Cell type:code id: tags:

 ``` python
 print("MWM solution was computed in %d minutes" % int((end - start)/60))
 ```

+%% Output
+
+    MWM solution was computed in 12 minutes
+
 %% Cell type:markdown id: tags:

 # Section 6: Constructing Ancestral Contigs
 In order to succesfully construct ancestral contigs, we require:
 - Successful Maximum Weight Matching on the prepared input data

 %% Cell type:markdown id: tags:

 ## Creating Contigs

 %% Cell type:markdown id: tags:

 Produces the contigs for each ancestor.

 %% Cell type:code id: tags:

 ``` python
 # creating contigs from the maximum weight matching output
 contig_template = parsed_config['output_file']['contig_template']

 # string to use for next step
 dcj_files = []

 for i in tqdm(range(1, num_anc), total = len(mwm_inputs)):
    outfile_mwmout = directory + mwm_output_template + str(i) + ".txt"
    outfile_contig = directory + contig_template + str(i) + ".txt"
    dcj_files.append(outfile_contig)

    contig = Contig(outfile_mwmout, int(num_gene_families))
    contig.get_edge()
    list_telomeres = contig.find_telomeres()

    contig_list = contig.get_contigs(list_telomeres, outfile_contig, str(i))
 ```

+%% Output
+
+    
+      0%|          | 0/4 [00:00<?, ?it/s][A
+     50%|█████     | 2/4 [00:00<00:00, 13.52it/s][A
+    100%|██████████| 4/4 [00:00<00:00, 14.02it/s][A
+
 %% Cell type:markdown id: tags:

 # Section 7: Calculating DCJ Distance Between Ancestors
 In order to succesfully calculate the DCJ Distance between ancestors, we require:
 - Successful creation of ancestral contigs

 %% Cell type:markdown id: tags:

 ## Calculating DCJ Distance

 %% Cell type:code id: tags:

 ``` python
 jar_path = parsed_config['input_file']['jar_path']

 dcj_output = []

 # Calculate DCJ Distance
 print("Starting DCJ Calculations, May Take a While")

 for i in tqdm(range(1, num_anc)):
    for j in range(i + 1, num_anc):
        dcj_output_path = directory + parsed_config['output_file']['dcj_output_path'] + str(i) + "_" + str(j) + ".txt"
        command = "java -jar " + jar_path + "/UniMoG-java11.jar " + str(dcj_files[i - 1]) + " " + str(dcj_files[j - 1]) + " -m=1 -p >" + dcj_output_path
        dcj_output.append(dcj_output_path)
        print(command)
        print("Computing DCJ distance between ancestors", i, "and", j)
        os.system(command)

 print("Calculation Done")
 ```

+%% Output
+
+    
+      0%|          | 0/4 [00:00<?, ?it/s][A
+
+    Starting DCJ Calculations, May Take a While
+    java -jar ./UniMoG-java11.jar ../outputData/project-monocots/contigAnc1.txt ../outputData/project-monocots/contigAnc2.txt -m=1 -p >../outputData/project-monocots/dcj_output1_2.txt
+    Computing DCJ distance between ancestors 1 and 2
+    java -jar ./UniMoG-java11.jar ../outputData/project-monocots/contigAnc1.txt ../outputData/project-monocots/contigAnc3.txt -m=1 -p >../outputData/project-monocots/dcj_output1_3.txt
+    Computing DCJ distance between ancestors 1 and 3
+    java -jar ./UniMoG-java11.jar ../outputData/project-monocots/contigAnc1.txt ../outputData/project-monocots/contigAnc4.txt -m=1 -p >../outputData/project-monocots/dcj_output1_4.txt
+    Computing DCJ distance between ancestors 1 and 4
+
+    
+     25%|██▌       | 1/4 [00:22<01:08, 22.83s/it][A
+
+    java -jar ./UniMoG-java11.jar ../outputData/project-monocots/contigAnc2.txt ../outputData/project-monocots/contigAnc3.txt -m=1 -p >../outputData/project-monocots/dcj_output2_3.txt
+    Computing DCJ distance between ancestors 2 and 3
+    java -jar ./UniMoG-java11.jar ../outputData/project-monocots/contigAnc2.txt ../outputData/project-monocots/contigAnc4.txt -m=1 -p >../outputData/project-monocots/dcj_output2_4.txt
+    Computing DCJ distance between ancestors 2 and 4
+
+    
+     50%|█████     | 2/4 [00:37<00:36, 18.02s/it][A
+
+    java -jar ./UniMoG-java11.jar ../outputData/project-monocots/contigAnc3.txt ../outputData/project-monocots/contigAnc4.txt -m=1 -p >../outputData/project-monocots/dcj_output3_4.txt
+    Computing DCJ distance between ancestors 3 and 4
+
+    
+    100%|██████████| 4/4 [00:44<00:00, 11.24s/it][A
+
+    Calculation Done
+
+    
+
 %% Cell type:code id: tags:

 ``` python
 dcj_summary = directory + parsed_config['output_file']['dcj_summary_path']
 dcj_matrix = []

 with open(dcj_summary, 'w') as dcj_summary_file:
    for i in range(len(median_structure)):
            dcj_summary_file.write("Median Structure for Ancestor " + str((i + 1)) + ": " + "%s" % median_structure[i] + "\n")

    for file in dcj_output:
        path = file

        with open(path, 'r') as dcj_file:
            dcj_info = dcj_file.readlines()

        dcj_summary_file.write(dcj_info[0])
        print(dcj_info[0])

        distance = int((dcj_info[0].split())[-1])
        dcj_matrix.append(distance)

 print("Summay of pairwise DCJ distance -- Done")
 ```

+%% Output
+
+    > "Ancestor 1" & "Ancestor 2" (DCJ) : 2572
+    
+    > "Ancestor 1" & "Ancestor 3" (DCJ) : 3146
+    
+    > "Ancestor 1" & "Ancestor 4" (DCJ) : 3505
+    
+    > "Ancestor 2" & "Ancestor 3" (DCJ) : 2478
+    
+    > "Ancestor 2" & "Ancestor 4" (DCJ) : 3128
+    
+    > "Ancestor 3" & "Ancestor 4" (DCJ) : 2483
+    
+    Summay of pairwise DCJ distance -- Done
+
 %% Cell type:code id: tags:

 ``` python
 matrix_indices = [("Ancestor " + str(i)) for i in range (1, num_anc)]
 dcj_matrix_summary = pd.DataFrame(squareform(dcj_matrix), index=matrix_indices, columns=matrix_indices)
 dcj_matrix_summary.style.set_caption("DCJ Distance Summary Symmetric Matrix")
 dcj_matrix_summary
 ```

+%% Output
+
+                Ancestor 1  Ancestor 2  Ancestor 3  Ancestor 4
+    Ancestor 1           0        2572        3146        3505
+    Ancestor 2        2572           0        2478        3128
+    Ancestor 3        3146        2478           0        2483
+    Ancestor 4        3505        3128        2483           0
+
+%% Cell type:code id: tags:
+
+``` python
+```
+
 %% Cell type:code id: tags:

 ``` python
 ```

 %% Cell type:markdown id: tags:

 # RACCROCHE - Module 1 & 2
 This is a full workflow that shows methods regarding the construction of gene families, listing of candidate adjacencies and construction of ancestral contigs using Maximum Weight Matching.

 %% Cell type:markdown id: tags:

 # Section 1: Importing Libraries, Modules and Configuration File

 %% Cell type:code id: tags:

 ``` python
 # setting the Notebook Display Method
 %matplotlib inline
 ```

 %% Cell type:code id: tags:

 ``` python
 # install tqdm-joblib
 %pip install tqdm-joblib

 # install Bio
 %pip install Bio
 ```

 %% Cell type:code id: tags:

 ``` python
 # import libraries
 import sys
 import yaml
 import time
 import os
 import io

 import pandas as pd
 import seaborn as sns
 import matplotlib
 import matplotlib.pyplot as plt
 from Bio import Phylo
 from scipy.spatial.distance import squareform

 from tqdm import tqdm
 import tqdm_joblib
 from joblib import Parallel, delayed

 # import modules
 #? eventually the below list will be in your package
 from GeneFamily import GeneFamily
 from Genome import Genome
 from MWMInputTreeNode import MWMInputTreeNode
 from mwmatching import *
 from Contig import *
 ```

 %% Cell type:markdown id: tags:

 ## Parsing the Configuration File

 The configuration file consists of a list of input/output directories, input files and parameters to the pipeline.

 Specifications for Input:
 - leaf_genome_info: the input genomes with their positions on the phylogenetic tree, i.e. *Genomes.txt* <br>
 - synteny_file_name: the post-fix of the synmap output of pairwise genome comparisons; <br>
 - synteny_file_path: the path to the synmap output data <br>
 - phylo_tree_path: the path to the Newick Phylogenetic Tree structure <br>
 - jar_path: the path to the UniMoG jar file <br>

 __Note__: The input genomes is in the *Genomes.txt* file.
 It contains the information about input genomes, including CoGe ID, genomeName, ancestor<br>
 The phylogenetic relationship is in the *phyloTree* file.
 It includes the unrooted phylogenetic tree in newick format where each tree leaf is denoted as name_CoGeID. <br>
 Sample Newick Tree for the monocot project: ( ( (Spirodela_51364, Acorus_54711), Dioscorea_51051), ( (Ananas_25734, Elaeis_33018), Asparagus_33908) )

 Specifications for Output:
 - gene_list: path where the gene list output should be created <br>
 - gene_family: path where the gene family output should be created <br>
 - genomes: path where the genome string output should be created <br>
 - mwm_input_template: path and template for the output files created from MWM Input step <br>
 - mwm_output_template: path and template for the output files created from MWM <br>
 - contig_template: path and template for the output files created from constructing contigs <br>
 - dcj_output_path: path to the output file when calculating DCJ distance between ancestors <br>
 - dcj_summary_path: path to the output file containing a summary of the DCJ calculations <br><br>

 Global parameters:
 - min_cutoff_weight: minimum similarity cutoff weight for gene family construction, i.e. *min_cutoff_weight=65*
 - max_cutoff_weight: maximum similarity cutoff weight for gene family construction, i.e. *max_cutoff_weight=100*
 - ws: window size, i.e. *ws=7*
 - min_mwm_weight: minimum adjacency weight to be considered for maximum weight matching, i.e. *min_mwm_weight=100*
 - gf1: maximum number of genes in a gene family, i.e. *gf1=50*
 - gf2: the maximum number of genes from a genome in a gene family, i.e. *gf2=10*
 - gf3: the minimum number of genomes in a gene family, i.e. *gf3=1*

 %% Cell type:code id: tags:

 ``` python
 # specify your Project Name

-proj_name = "monocots"
+proj_name = "buxus"

 # reading from the configuration file
 config_file = "../inputData/project-" + proj_name + "/config.yaml"

 with open(config_file, 'r') as stream:
    parsed_config = yaml.load(stream, Loader=yaml.FullLoader)

 directory = parsed_config['output_path'] + parsed_config['project_name']
 os.makedirs(directory, exist_ok = True)

 print("Project Name and Input Directory:", parsed_config['input_path'] + parsed_config['project_name'])
 print("Project Name and Output Directory:", parsed_config['output_path'] + parsed_config['project_name'])
 print('''Please check required input data under the Input Directory:
 \t 1. Genome.txt with info about input genomes
 \t 2. phyloTree.txt with Newick Tree Structure
 \t 3. SynMap results between every pair of genomes
 ''')
 #print("Project input data directory:", parsed_config['input_file']['synteny_file_path'])
 #print("Input extant genomes info:", parsed_config['input_file']['leaf_genome_info'])
 #print("The postfix of SynMap output files:", parsed_config['input_file']['synteny_file_name'])
 ```

 %% Output

-    Project Name and Input Directory: ../inputData/project-monocots
-    Project Name and Output Directory: ../outputData/project-monocots
+    Project Name and Input Directory: ../inputData/project-buxus
+    Project Name and Output Directory: ../outputData/project-buxus
    Please check required input data under the Input Directory:
    	 1. Genome.txt with info about input genomes
    	 2. phyloTree.txt with Newick Tree Structure
    	 3. SynMap results between every pair of genomes
    

 %% Cell type:markdown id: tags:

 # Section 2: Constructing Gene Families from syntenically validated orthologs from SynMap
 In order to succesfully construct gene families, we require:
 - Successful parsing of the configuration file <br>
 - Valid parameters in the configuration file

 %% Cell type:markdown id: tags:

 ## Reading in Input Genome Data and Tree Structure Data
 Each genome is a leaf of the input phylogenetic tree. The *get_leaves_and_tree_info()* method extracts the following attribute of each genome from the input file specified by *leaf_genome_info* parameter in the configuration file.
 >1) genome ID; <br>
 >2) genome name; <br>
 >3) most recent ancestor; <br>
 >4) number of chromosomes; <br>
 >5) file name of the genome annotations; <br>
 >6) desired tree structure in Newick format.

 %% Cell type:code id: tags:

 ``` python
 # reading in input genome info
 gene_family = GeneFamily(parsed_config)
 all_leaves, median_structure, newick_structure = gene_family.get_leaves_and_tree_info()

 print("Extant genomes to be analyzed in this project: \n")

 genomes = pd.DataFrame(all_leaves)
 display = genomes.iloc[:, :3]

 display.columns = ['Genome ID', 'Genome Name', 'Ancestor']
 display.set_index('Genome ID', inplace = True, drop = True)

 display
 ```

 %% Output

    Confirm ancestors:
-    ['51364', '54711', '25734,33018,33908,51051']
-    ['51364,54711', '51051', '25734,33018,33908']
-    ['51364,54711,51051', '33908', '25734,33018']
-    ['25734', '33018', '33908,51051,51364,54711']
+    ['19990', '54057', '57266,57268,28620,35405']
+    ['57266', '57268', '54057,19990,28620,35405']
+    ['19990,54057', '57266,57268', '28620,35405']
+    ['28620', '35405', '54057,19990,57266,57268']
    
    Extant genomes to be analyzed in this project:
    

-              Genome Name Ancestor
+                Genome Name Ancestor
    Genome ID
-    25734          Ananas        4
-    33018          Elaeis        4
-    33908       Asparagus        3
-    51051       Dioscorea        2
-    51364       Spirodela        1
-    54711          Acorus        1
+    19990             Vitis        1
+    28620         Aquilegea        4
+    35405           Nelumbo        4
+    54057        Amaranthus        1
+    57266      Tetracentron        2
+    57268             Buxus        2

 %% Cell type:code id: tags:

 ``` python
 tree = Phylo.read(io.StringIO(newick_structure), "newick")

 matplotlib.rc('font', size=8)
 fig = plt.figure(figsize=(8, 5), dpi=100)
 axes = fig.add_subplot(1, 1, 1)
 axes.set_facecolor('white')

 Phylo.draw(tree, axes = axes)
 ```

 %% Output

-
+

 %% Cell type:markdown id: tags:

 ## Creating gene families from syntenically validated homology generated from SynMap

 Read in all genes for every genome, record gene-related information and use gf1 (from config file), cutoff weights in order to create gene families.

 %% Cell type:code id: tags:

 ``` python
 print(all_leaves)
 # creating gene families
 gene_family.make_gene_family(all_leaves)

 print(len(all_leaves))
 ```

 %% Output

    [['25734', 'Ananas', '4', '25'], ['33018', 'Elaeis', '4', '16'], ['33908', 'Asparagus', '3', '10'], ['51051', 'Dioscorea', '2', '21'], ['51364', 'Spirodela', '1', '20'], ['54711', 'Acorus', '1', '12']]
    6

 %% Cell type:markdown id: tags:

 ## Visualizing Gene Families
 In the configuration file *config.yaml*, there are three parameters to restrict the size of gene families:
 - gf1: the maximum number of genes in a gene family
 - gf2: the maximum number of genes from a single genome in a gene family
 - gf3: the minimum number of genomes in a gene family <br>

 %% Cell type:code id: tags:

 ``` python
 families = list(gene_family.gene_family.keys())

 values = list(gene_family.gene_family.values())
 lengths = [len(values[i]) for i in range(0, len(values))]

 print("Total number of gene familes:", len(families))
 print("The size of the largest gene family:", max(lengths))
 ```

 %% Output

-    Total number of gene familes: 8790
+    Total number of gene familes: 10236
    The size of the largest gene family: 49

 %% Cell type:code id: tags:

 ``` python
 sns.set(style="darkgrid")
 family_dict = {'label': families, 'lengths': lengths}
 pdfamily = pd.DataFrame(family_dict)

 fig=sns.histplot(data = pdfamily, x = "lengths", binwidth = 3).set(title = "Overview of Gene Families")

 plt.xlabel("Number of genes in a gene family")
 plt.ylabel("Frequency")
 plt.show(fig)
 ```

 %% Output

-
+

 %% Cell type:markdown id: tags:

 # Section 3: Representing Genomes by Gene Family Labels
 In order to succesfully separate the input data (genes) into respective chromosomes and genomes, we require:
 - Successful creation of gene families

 %% Cell type:markdown id: tags:

 ## Grouping Genes and their ordering by Chromosomes then Chromosomes by Genomes

 %% Cell type:code id: tags:

 ``` python
 # creating an instance of the Genome class
 initialize_genome = Genome(gene_family.gene_list, all_leaves, parsed_config)
 ```

 %% Cell type:code id: tags:

 ``` python
 # separating each genome's genes into chromosomes
 genome = initialize_genome.get_genome_in_string()
 ```

 %% Cell type:markdown id: tags:

 ## Visualizing Genes Families in Chromosomes of extant genomes

 %% Cell type:code id: tags:

 ``` python
 genomes, chromosomes, gene_lengths = [], [], []
 genome_lengths = []
 num_chr = []

 for key, value in genome.items():
    current_lengths = 0

    for genome_id in all_leaves:
        if int(genome_id[0]) == int(key):
            genome_name = genome_id[1]
            num_chromosomes = int(genome_id[3])
            num_chr.append(int(num_chromosomes))

    for i, chromosome in enumerate(value):
        genomes.append(genome_name)
        gene_lengths.append(len(chromosome))
        current_lengths += 1

    chromosomes.extend(range(1, num_chromosomes + 1))

    genome_lengths.append(current_lengths)

 #print(chromosomes)
 ```

 %% Cell type:code id: tags:

 ``` python
 length_index = 0

 # removing short scaffolds for visualization
 for i, lengths in enumerate(genome_lengths):
    left = length_index
    right = length_index + genome_lengths[i]

    gene_lengths[left:right] = sorted(gene_lengths[left:right], reverse = True)
    del gene_lengths[left + num_chr[i] : right]
    del genomes[left + num_chr[i] : right]

    length_index += num_chr[i]
 ```

 %% Cell type:code id: tags:

 ``` python
 print("Visualizing the number of gene families in each chromosome of each genome")

 data = {'genome': genomes, 'chromosome number': chromosomes, 'number of gene families': gene_lengths}

 pddata = pd.DataFrame(data)

 sns.set(style="darkgrid")

 g = sns.catplot(x = 'chromosome number', y = 'number of gene families', col = 'genome', data = pddata, kind="bar", aspect = .7, sharex = False, dodge = False)
 g.set_xticklabels(rotation = 30)
 ```

 %% Output

    Visualizing the number of gene families in each chromosome of each genome

    /Users/lij313/opt/anaconda3/lib/python3.8/site-packages/seaborn/categorical.py:3803: UserWarning: Setting `sharex=False` with `color=None` may cause different levels of the `x` variable to share colors. This will change in a future version.
      warnings.warn(msg.format("sharex", "x"), UserWarning)

-    <seaborn.axisgrid.FacetGrid at 0x7fc296913190>
+    <seaborn.axisgrid.FacetGrid at 0x7fd3b1ac57c0>



 %% Cell type:markdown id: tags:

 # Section 4: Extract gene proximity (or adjacencies) from each extant genome

 Identify generalized gene adjacencies with a user specified window/gap size. The size parameter (*ws*) is defined in the configuration file.
 In order to succesfully prepare adjacency data for the Maximum Weight Matching algorithm, we require:
 - Successful separation of input data

 %% Cell type:markdown id: tags:

 ## Collecting Gene Adjacencies for Maximum Weight Matching

 Adjacencies are extracted from gene orders in each extant genome. <br>
 A large number of (generalized) adjacencies is collected for each ancestor defined in *median_structure*.


 Number of edges are the same since the number of genes and window size is consistent for each ancestor. However, the weights will be different.

 %% Cell type:code id: tags:

 ``` python
 ### manually change the median structure to construct monoploid extant genome
 #median_structure=[['61837', '61837', '61837']]
 ```

 %% Cell type:code id: tags:

 ``` python
 # preparing data for maximum weight matching + maximum weight matching
 window_size = parsed_config['ws']
 min_adj_weight = parsed_config['min_mwm_weight']
 num_gene_families = len(gene_family.gene_family)

 input_tree_node = MWMInputTreeNode(genome, all_leaves)

 mwmin_output_template = parsed_config['output_file']['mwm_input_template']

 num_anc = 1
 mwm_inputs = [None] * len(median_structure)

 for i, structure in tqdm(enumerate(median_structure), total=len(mwm_inputs)):
    outfile_mwmin = directory + mwmin_output_template + str(num_anc) + ".txt"
    mwm_inputs[num_anc - 1] = input_tree_node.get_mwm_input(structure, num_gene_families, window_size, min_adj_weight, outfile_mwmin)
    num_anc += 1
 ```

 %% Output

-    100%|██████████| 4/4 [00:02<00:00,  1.53it/s]
+    100%|██████████| 4/4 [00:02<00:00,  1.50it/s]

 %% Cell type:markdown id: tags:

 # Section 5: Maximum Weight Matching
 In order to successfully complete the Maximum Weight Matching algorithm, we require:
 - Successful preparation of Maximum Weight Matching Input Data

 %% Cell type:markdown id: tags:

 ## Maximum Weight Matching

 %% Cell type:markdown id: tags:

 Produces the result of Maximum Weight Matching for each ancestor. <br>
 This step is computationally expensive. <br>
 Uses multiprocessing to speed up the process.

 %% Cell type:code id: tags:

 ``` python
 # The path of output file in general form
 mwm_output_template = parsed_config['output_file']['mwm_output_template']

 def mwm_matching(mwm_input, num_anc):
    outfile_mwmout = directory + mwm_output_template + str(num_anc) + ".txt"
    maxWeightMatching(mwm_input, outfile_mwmout)
 ```

 %% Cell type:code id: tags:

 ``` python
 # this step is slow
 # inputs for multiprocessing
 numbers = [i for i in range(1, num_anc)]
 inputs = zip(mwm_inputs, numbers)

 start = time.time()

 # multiprocessing to make workflow faster
 with tqdm_joblib.tqdm_joblib(tqdm(desc = "Maximum Weight Matching", total = len(mwm_inputs))) as progress_bar:
    Parallel(n_jobs=(num_anc - 1))(delayed(mwm_matching)(mwm_inputs[i], numbers[i]) for i in range (0, len(mwm_inputs)))

 end = time.time()
 ```

 %% Output

    Maximum Weight Matching:   0%|          | 0/4 [00:00<?, ?it/s]
      0%|          | 0/4 [00:00<?, ?it/s][A
+     25%|██▌       | 1/4 [11:27<34:21, 687.08s/it][A
+     50%|█████     | 2/4 [11:41<09:42, 291.39s/it][A
+     75%|███████▌  | 3/4 [11:49<02:42, 162.13s/it][A
+    100%|██████████| 4/4 [12:28<00:00, 187.20s/it][A

 %% Cell type:code id: tags:

 ``` python
 print("MWM solution was computed in %d minutes" % int((end - start)/60))
 ```

+%% Output
+
+    MWM solution was computed in 12 minutes
+
 %% Cell type:markdown id: tags:

 # Section 6: Constructing Ancestral Contigs
 In order to succesfully construct ancestral contigs, we require:
 - Successful Maximum Weight Matching on the prepared input data

 %% Cell type:markdown id: tags:

 ## Creating Contigs

 %% Cell type:markdown id: tags:

 Produces the contigs for each ancestor.

 %% Cell type:code id: tags:

 ``` python
 # creating contigs from the maximum weight matching output
 contig_template = parsed_config['output_file']['contig_template']

 # string to use for next step
 dcj_files = []

 for i in tqdm(range(1, num_anc), total = len(mwm_inputs)):
    outfile_mwmout = directory + mwm_output_template + str(i) + ".txt"
    outfile_contig = directory + contig_template + str(i) + ".txt"
    dcj_files.append(outfile_contig)

    contig = Contig(outfile_mwmout, int(num_gene_families))
    contig.get_edge()
    list_telomeres = contig.find_telomeres()

    contig_list = contig.get_contigs(list_telomeres, outfile_contig, str(i))
 ```

+%% Output
+
+    
+      0%|          | 0/4 [00:00<?, ?it/s][A
+     50%|█████     | 2/4 [00:00<00:00, 13.52it/s][A
+    100%|██████████| 4/4 [00:00<00:00, 14.02it/s][A
+
 %% Cell type:markdown id: tags:

 # Section 7: Calculating DCJ Distance Between Ancestors
 In order to succesfully calculate the DCJ Distance between ancestors, we require:
 - Successful creation of ancestral contigs

 %% Cell type:markdown id: tags:

 ## Calculating DCJ Distance

 %% Cell type:code id: tags:

 ``` python
 jar_path = parsed_config['input_file']['jar_path']

 dcj_output = []

 # Calculate DCJ Distance
 print("Starting DCJ Calculations, May Take a While")

 for i in tqdm(range(1, num_anc)):
    for j in range(i + 1, num_anc):
        dcj_output_path = directory + parsed_config['output_file']['dcj_output_path'] + str(i) + "_" + str(j) + ".txt"
        command = "java -jar " + jar_path + "/UniMoG-java11.jar " + str(dcj_files[i - 1]) + " " + str(dcj_files[j - 1]) + " -m=1 -p >" + dcj_output_path
        dcj_output.append(dcj_output_path)
        print(command)
        print("Computing DCJ distance between ancestors", i, "and", j)
        os.system(command)

 print("Calculation Done")
 ```

+%% Output
+
+    
+      0%|          | 0/4 [00:00<?, ?it/s][A
+
+    Starting DCJ Calculations, May Take a While
+    java -jar ./UniMoG-java11.jar ../outputData/project-monocots/contigAnc1.txt ../outputData/project-monocots/contigAnc2.txt -m=1 -p >../outputData/project-monocots/dcj_output1_2.txt
+    Computing DCJ distance between ancestors 1 and 2
+    java -jar ./UniMoG-java11.jar ../outputData/project-monocots/contigAnc1.txt ../outputData/project-monocots/contigAnc3.txt -m=1 -p >../outputData/project-monocots/dcj_output1_3.txt
+    Computing DCJ distance between ancestors 1 and 3
+    java -jar ./UniMoG-java11.jar ../outputData/project-monocots/contigAnc1.txt ../outputData/project-monocots/contigAnc4.txt -m=1 -p >../outputData/project-monocots/dcj_output1_4.txt
+    Computing DCJ distance between ancestors 1 and 4
+
+    
+     25%|██▌       | 1/4 [00:22<01:08, 22.83s/it][A
+
+    java -jar ./UniMoG-java11.jar ../outputData/project-monocots/contigAnc2.txt ../outputData/project-monocots/contigAnc3.txt -m=1 -p >../outputData/project-monocots/dcj_output2_3.txt
+    Computing DCJ distance between ancestors 2 and 3
+    java -jar ./UniMoG-java11.jar ../outputData/project-monocots/contigAnc2.txt ../outputData/project-monocots/contigAnc4.txt -m=1 -p >../outputData/project-monocots/dcj_output2_4.txt
+    Computing DCJ distance between ancestors 2 and 4
+
+    
+     50%|█████     | 2/4 [00:37<00:36, 18.02s/it][A
+
+    java -jar ./UniMoG-java11.jar ../outputData/project-monocots/contigAnc3.txt ../outputData/project-monocots/contigAnc4.txt -m=1 -p >../outputData/project-monocots/dcj_output3_4.txt
+    Computing DCJ distance between ancestors 3 and 4
+
+    
+    100%|██████████| 4/4 [00:44<00:00, 11.24s/it][A
+
+    Calculation Done
+
+    
+
 %% Cell type:code id: tags:

 ``` python
 dcj_summary = directory + parsed_config['output_file']['dcj_summary_path']
 dcj_matrix = []

 with open(dcj_summary, 'w') as dcj_summary_file:
    for i in range(len(median_structure)):
            dcj_summary_file.write("Median Structure for Ancestor " + str((i + 1)) + ": " + "%s" % median_structure[i] + "\n")

    for file in dcj_output:
        path = file

        with open(path, 'r') as dcj_file:
            dcj_info = dcj_file.readlines()

        dcj_summary_file.write(dcj_info[0])
        print(dcj_info[0])

        distance = int((dcj_info[0].split())[-1])
        dcj_matrix.append(distance)

 print("Summay of pairwise DCJ distance -- Done")
 ```

+%% Output
+
+    > "Ancestor 1" & "Ancestor 2" (DCJ) : 2572
+    
+    > "Ancestor 1" & "Ancestor 3" (DCJ) : 3146
+    
+    > "Ancestor 1" & "Ancestor 4" (DCJ) : 3505
+    
+    > "Ancestor 2" & "Ancestor 3" (DCJ) : 2478
+    
+    > "Ancestor 2" & "Ancestor 4" (DCJ) : 3128
+    
+    > "Ancestor 3" & "Ancestor 4" (DCJ) : 2483
+    
+    Summay of pairwise DCJ distance -- Done
+
 %% Cell type:code id: tags:

 ``` python
 matrix_indices = [("Ancestor " + str(i)) for i in range (1, num_anc)]
 dcj_matrix_summary = pd.DataFrame(squareform(dcj_matrix), index=matrix_indices, columns=matrix_indices)
 dcj_matrix_summary.style.set_caption("DCJ Distance Summary Symmetric Matrix")
 dcj_matrix_summary
 ```

+%% Output
+
+                Ancestor 1  Ancestor 2  Ancestor 3  Ancestor 4
+    Ancestor 1           0        2572        3146        3505
+    Ancestor 2        2572           0        2478        3128
+    Ancestor 3        3146        2478           0        2483
+    Ancestor 4        3505        3128        2483           0
+
+%% Cell type:code id: tags:
+
+``` python
+```
+
 %% Cell type:code id: tags:

 ``` python
 ```
No results found