pibase | pibase_to_rdf | example pibase_to_rdf.sh
The Linux Bash shell script below assumes that some pibase files have already been created using the example download data, in a preceding pibase_test.sh run. It performs pibase_fisherdiff for three remaining pairs of files, and then invokes pibase_to_rdf which creates the rdf file.
################################################################ #### STEP 1 #### Create list files "trio.txt" and "trio_ref.txt" for pibase_to_rdf #### (These files are already included in the example data download) cat trio.txt dummy ILUNA12878 output/diff_cov5_gen_NA12878_NA12891_ILLUMINA.txt NA12891 output/diff_cov5_gen_NA12878_NA12892_ILLUMINA.txt NA12892 output/diff_cov5_gen_NA12878_ILLUMINA_SOLID.txt SOLNA12878 output/diff_cov5_gen_NA12878_ILLUMINA_FLX.txt FLXNA12878 cat trio_ref.txt dummy HG19REF output/diff_cov5_gen_ref_NA12878_ILLUMINA.txt NA12878 output/diff_cov5_gen_ref_NA12891_ILLUMINA.txt NA12891 output/diff_cov5_gen_ref_NA12892_ILLUMINA.txt NA12892 output/diff_cov5_gen_ref_NA12878_SOLID.txt SOLNA12878 output/diff_cov5_gen_ref_NA12878_FLX.txt FLXNA12878 ################################################################ #!/bin/bash #### STEP 2 #### prepare pibase_fisherdiff files of 5 HapMap runs of 3 individuals #### for Network analysis # compare two files at a time (normally a control/case pair, e.g. 
normal/tumor tissue) # Options: min coverage >= 5, p.median <= 0.05, factor <= 10 # compare daughter with her father pibase_fisherdiff output/gen_NA12878_ILLUMINA.txt output/gen_NA12891_ILLUMINA.txt output/diff_cov5_gen_NA12878_NA12891_ILLUMINA.txt 5 0.05 10 # compare daughter with her mother pibase_fisherdiff output/gen_NA12878_ILLUMINA.txt output/gen_NA12892_ILLUMINA.txt output/diff_cov5_gen_NA12878_NA12892_ILLUMINA.txt 5 0.05 10 # compare daughter with her SOLiD self pibase_fisherdiff output/gen_NA12878_ILLUMINA.txt output/gen_NA12878_SOLID.txt output/diff_cov5_gen_NA12878_ILLUMINA_SOLID.txt 5 0.05 10 # compare daughter with her FLX self pibase_fisherdiff output/gen_NA12878_ILLUMINA.txt output/gen_NA12878_FLX.txt output/diff_cov5_gen_NA12878_ILLUMINA_FLX.txt 5 0.05 10 #### STEP 3 #### Create trio.p20n.rdf for illustrative purposes: #### using p <= 0.2 and no both-stranded confirmation pibase_to_rdf trio.txt output/trio.p20n.rdf 0.2 n # Version with eliminated "N"-columns and invariable columns pibase_to_rdf trio.txt output/trio.p20n.elim.rdf 0.2 n y #### STEP 4 #### Start Network 4.6.0.0 #### Calculate Network / Network Calculations / Median Joining #### File / Open : trio.p20n.rdf / Calculate network. ################################################################ ### trio.p20n.rdf IS THE DATA FILE USED FOR THE FIGURE AND THE ### MANUSCRIPT FOR DEMONSTRATION PURPOSES (AS THE SOLID'S COVERAGE WAS THIN). ### NETWORK 4.6.0.0 REPLACES 'N' (SEE BELOW) WITH 1/0 FROM THE ### CLOSEST SEQUENCE. ### FOR NETWORKS FROM DATA FROM THE SAME PLATFORM, WE RECOMMEND TO ### USE THE OPTION TO ELIMINATE Ns, AND TO TRY REFERENCE SAMPLE ALTERNATIVES ### (E.G. 
SEE BELOW) ### rdf file with "N"s and with invariable columns ("characters"): cat output/trio.p20n.rdf ;1.0 1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;19;20;21;22;23;24;25;26;27;28;29;30;31;32;33;34;35;36;37;38;39;40;41;42;43;44;45;46;47;48;49;50;51;52;53;54; 10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10; >ILUNA12878;1;;;;;;; 111111111111111111111111111111111111111111111111111111 >NA12891;1;;;;;;; 111110101111011111111111101011111111110111100111011111 >NA12892;1;;;;;;; 111111011111111010010011101111111011011101111110100011 >SOLNA12878;1;;;;;;; 1N1NNNNN1N1NNN11N1111111NNN1NNN110N111N10NN1N1NN1111NN >FLXNA12878;1;;;;;;; 111111111111111111101111111111111111111101111111N11111 ### rdf file where "N"s and invariable columns ("characters") were eliminated: cat output/trio.p20n.elim.rdf ;1.0 18;20;21;22;28;34;37;41;44;50;51;52; 10;10;10;10;10;10;10;10;10;10;10;10; >ILUNA12878;1;;;;;;; 111111111111 >NA12891;1;;;;;;; 111101110111 >NA12892;1;;;;;;; 010010001000 >SOLNA12878;1;;;;;;; 111110101111 >FLXNA12878;1;;;;;;; 101011101111 ##################################################################### #### ALTERNATIVE STEP 3: #### Use a "clean" reference sample generated by pibase_rdf_ref # generate this clean reference sample: pibase_rdf_ref output/gen_NA12878_ILLUMINA.txt output/gen_refsample.txt 100 # compare the 5 samples pair-wise against this clean reference sample pibase_fisherdiff output/gen_refsample.txt output/gen_NA12878_ILLUMINA.txt output/diff_cov5_gen_ref_NA12878_ILLUMINA.txt 5 0.05 10 pibase_fisherdiff output/gen_refsample.txt output/gen_NA12891_ILLUMINA.txt output/diff_cov5_gen_ref_NA12891_ILLUMINA.txt 5 0.05 10 pibase_fisherdiff output/gen_refsample.txt output/gen_NA12892_ILLUMINA.txt output/diff_cov5_gen_ref_NA12892_ILLUMINA.txt 5 0.05 10 pibase_fisherdiff output/gen_refsample.txt output/gen_NA12878_SOLID.txt 
output/diff_cov5_gen_ref_NA12878_SOLID.txt 5 0.05 10 pibase_fisherdiff output/gen_refsample.txt output/gen_NA12878_FLX.txt output/diff_cov5_gen_ref_NA12878_FLX.txt 5 0.05 10 # generate rdf file using p<=0.2 and no both-stranded validation to detect differences: pibase_to_rdf trio_ref.txt output/trio_ref.p20n.rdf 0.2 n # generate rdf file using p<=0.2 and no both-stranded validation to detect differences, and eliminate Ns and invariable characters: pibase_to_rdf trio_ref.txt output/trio_ref.p20n.elim.rdf 0.2 n y ################################################################ ### rdf file with "N"s and with invariable columns ("characters"): cat output/trio_ref.p20n.rdf ;1.0 1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;19;20;21;22;23;24;25;26;27;28;29;30;31;32;33;34;35;36;37;38;39;40;41;42;43;44;45;46;47;48;49;50;51;52;53;54; 10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10; >HG19REF;1;;;;;;; 111111111111111111111111111111111111111111111111111111 >NA12878;1;;;;;;; 111110001110011011001011101011101011010101111111000110 >NA12891;1;;;;;;; 111111011110011011001011111011101011010101100111100110 >NA12892;1;;;;;;; 111110001100011110000111101011101111010111111110011011 >SOLNA12878;1;;;;;;; 1N1NNNNN1N1NNN10N1011111NNN0NNN011N101N11NN1N1NN1001NN >FLXNA12878;1;;;;;;; 111110101110011011001011101011101011010111111111N00110 ### rdf file where "N"s and invariable columns ("characters") were eliminated: cat output/trio_ref.p20n.elim.rdf ;1.0 11;16;18;19;20;21;22;28;32;34;37;41;44;50;51;52; 10;10;10;10;10;10;10;10;10;10;10;10;10;10;10;10; >HG19REF;1;;;;;;; 1111111111111111 >NA12878;1;;;;;;; 1010010000001001 >NA12891;1;;;;;;; 1010010000000001 >NA12892;1;;;;;;; 0100001001011110 >SOLNA12878;1;;;;;;; 1010111001011001 >FLXNA12878;1;;;;;;; 1010010000011001
^top