Ancestral Reconstruction Pipeline: Difference between revisions
No edit summary |
No edit summary |
||
Line 4: | Line 4: | ||
<br> | <br> | ||
<pre>#compile | <pre>#compile | ||
#gets gene pairs from SynMap output | #gets gene pairs from SynMap output | ||
Line 10: | Line 11: | ||
#run with config file | #run with config file | ||
#config file: number of genomes and syntenic depth relationships | #config file: number of genomes and syntenic depth relationships | ||
java TestGetGenomes data/inputInfoCoGe.txt | java TestGetGenomes data/inputInfoCoGe.txt | ||
#outputs from above | #outputs from above: | ||
# orthologSets_8400_9050_10997_19515.txt | |||
# This file contains orthologous sets of genes across genomes | |||
<pre> | |||
#ortholog_set org_1_paralog_1|org_1_paralog_2|org_1_paralog_3 org_2_paralog_1|etc. . | |||
#max number of paralog genes in an organism is based on the syntenic depth for that organism. E.g., three for peach | |||
1 PAC:17653852|PAC:17650319|PAC:17656076 GSVIVG01035853001|GSVIVG01017316001|GSVIVG01015543001 Tc06_g001640|Tc09_g002300|Tc09_g011400 evm_27.model.AmTr_v1.0_scaffold00078.25 | |||
2 PAC:17659187|PAC:17667521|PAC:17645335 GSVIVG01024935001|GSVIVG01034155001|GSVIVG01016522001 Tc05_g004430|Tc09_g031500|Tc10_g002720 evm_27.model.AmTr_v1.0_scaffold00012.209 | |||
3 PAC:17660832|PAC:17641120|PAC:17649248 GSVIVG01035855001|GSVIVG01017319001|GSVIVG01015546001 Tc06_g001660|Tc09_g002290|Tc09_g011430 evm_27.model.AmTr_v1.0_scaffold00078.24 | |||
.... | |||
10483 PAC:17654291 missing Tc02_g001110 missing | |||
</pre> | |||
# genomeInString_8400_9050_10997_19515.txt | |||
# subgenomeRangesInGeneOrder_8400_9050_10997_19515.txt | |||
javac TestGetContigInput.java | javac TestGetContigInput.java |
Revision as of 17:40, 24 April 2014
This page is to document the Ancestral Reconstruction Pipeline by Chunfang Zheng
Master control is from her batch script: batchFile.txt
#compile #gets gene pairs from SynMap output javac TestGetGenomes.java #run with config file #config file: number of genomes and syntenic depth relationships java TestGetGenomes data/inputInfoCoGe.txt #outputs from above: # orthologSets_8400_9050_10997_19515.txt # This file contains orthologous sets of genes across genomes <pre> #ortholog_set org_1_paralog_1|org_1_paralog_2|org_1_paralog_3 org_2_paralog_1|etc. . #max number of paralog genes in an organism is based on the syntenic depth for that organism. E.g., three for peach 1 PAC:17653852|PAC:17650319|PAC:17656076 GSVIVG01035853001|GSVIVG01017316001|GSVIVG01015543001 Tc06_g001640|Tc09_g002300|Tc09_g011400 evm_27.model.AmTr_v1.0_scaffold00078.25 2 PAC:17659187|PAC:17667521|PAC:17645335 GSVIVG01024935001|GSVIVG01034155001|GSVIVG01016522001 Tc05_g004430|Tc09_g031500|Tc10_g002720 evm_27.model.AmTr_v1.0_scaffold00012.209 3 PAC:17660832|PAC:17641120|PAC:17649248 GSVIVG01035855001|GSVIVG01017319001|GSVIVG01015546001 Tc06_g001660|Tc09_g002290|Tc09_g011430 evm_27.model.AmTr_v1.0_scaffold00078.24 .... 10483 PAC:17654291 missing Tc02_g001110 missing
- genomeInString_8400_9050_10997_19515.txt
- subgenomeRangesInGeneOrder_8400_9050_10997_19515.txt
javac TestGetContigInput.java java TestGetContigInput data/inputInfoAGRP.txt cd outputFiles python contigInput_8400_9050_10997_19515.py> contigOutput.txt cd .. javac TestGetContigOutputAndScaffoldInput.java java TestGetContigOutputAndScaffoldInput data/inputInfoAGRP.txt cd outputFiles python scaffoldInput1.py > scaffoldOutput1.txt python scaffoldInput2.py > scaffoldOutput2.txt python scaffoldInput3.py > scaffoldOutput3.txt python scaffoldInput4.py > scaffoldOutput4.txt python scaffoldInput5.py > scaffoldOutput5.txt python scaffoldInput6.py > scaffoldOutput6.txt python scaffoldInput7.py > scaffoldOutput7.txt cd .. javac TestScaffoldOutput.java java TestScaffoldOutput
inputInfo example file (describes input from CoGe)
#obvious numberOfGenomes 4 numberOfGenomePairs 9 #synmap output with correct syntenic depth 8400 9050 data/8400_9050.CDS-CDS.last.tdd10.cs0.filtered.dag.all.go_D20_g10_A5.aligncoords.Dm0.ma1.qac3.3.40.gcoords 10997 8400 data/10997_8400.CDS-CDS.last.tdd10.cs0.filtered.dag.all.go_D20_g10_A5.aligncoords.Dm0.ma1.qac3.3.40.gcoords 10997 9050 data/10997_9050.CDS-CDS.last.tdd10.cs0.filtered.dag.all.go_D20_g10_A5.aligncoords.Dm0.ma1.qac3.3.40.gcoords 10997 19515 data/10997_19515.CDS-CDS.last.tdd10.cs0.filtered.dag.all.go_D20_g10_A5.aligncoords.Dm0.ma1.qac3.1.40.gcoords 19515 8400 data/19515_8400.CDS-CDS.last.tdd10.cs0.filtered.dag.all.go_D20_g10_A5.aligncoords.Dm0.ma1.qac1.3.40.gcoords 19515 9050 data/19515_9050.CDS-CDS.last.tdd10.cs0.filtered.dag.all.go_D20_g10_A5.aligncoords.Dm0.ma1.qac1.3.40.gcoords 8400 8400 data/8400_8400.CDS-CDS.last.tdd10.filtered.dag.all.go_D20_g10_A5.aligncoords.Dm0.ma1.qac3.3.40.gcoords 9050 9050 data/9050_9050.CDS-CDS.last.tdd10.filtered.dag.all.go_D20_g10_A5.aligncoords.Dm0.ma1.qac3.3.40.gcoords 10997 10997 data/10997_10997.CDS-CDS.last.tdd10.filtered.dag.all.go_D20_g10_A5.aligncoords.Dm0.ma1.qac3.3.40.gcoords #syntenic depth among the genomes #peach 8400 3 #grape 9050 3 Cacao 10997 3 #amborella 19515 1 #subgenome information data/subGenomeRegions.txt
subGenomeRegions.txt
This file contains information about the subgenomes (parental genomes) making up an extant genome. Chunfang often creates these files by hand, but also has a program to generate them. Practical_Aliquoting
#genome_ID number_of_synteny_blocks paleopolyploid_depth title_for_set 10997 21 3 cacao #colorCode: means ancestral chromosome assignment -- better term is bin. For eudicots, this is thought to be 7 (could be other numbers for other reconstructions) #subgenome: the subgenome to which a block belongs #chr start end: position of block in extant genome colorCode subgenome chr start end 1 1 2 12716774 27462648 1 2 4 349021 14314443 1 3 3 208385 16091087 2 1 3 19982135 24212437 2 2 1 27207631 30674661 2 3 3 16741484 19970692 3 1 2 1350572 7237080 3 2 1 315357 7988483 3 3 8 43353 6712481 4 1 9 739576 3437504 4 2 6 1071819 9467758 4 3 9 3437504 9589803 5 1 4 18966341 23343107 5 2 1 21083375 26683534 5 3 5 23329957 25395907 6 1 9 23851693 28019603 6 2 5 541440 5362779 6 3 10 333882 12953021 7 1 6 10864133 14795052 7 2 1 8722499 15224371 7 3 7 511932 6542889 19515 0 1 amborella colorCode subgenome chr start end