#! /usr/local/bin/perl # mapdist_boot.pl: bootstraps estimates of mapdistance by calling # map_boot.pl to subsample loci, and mapmaker to generate linkage tables # Steve DiFazio, 11-2002 use Getopt::Long; use Stats::Basic; GetOptions( "c:i" => \$cycles, "f:s" => \$file, "l:i" => \$loci, "r" => \$repuls, "s:i" => \$score); if(!$ARGV[0] ) { die " _______________________________________________________________\n Usage: perl mapdist_boot.pl -r -f= -s= -l= -c= > out.txt \n where \t is an optional configuration file which allows batch processing. \tformat: distance loci cycles \t-r allows processing of markers in coupling and repulsion phase, \t is the number of loci to pull out of file (default 200), \t is the number of times dataset is to be sampled (default 100) \t is the cutoff distance for linkage (default 35) \t is mapmaker input file with all genotype info. \t is the name of the file to save the output\n Linkage determined from LOD scores only. _______________________________________________________________\n\n"; } warn "\n"; my ($m1,$m2,$th,$lod,$lod,$links); $input=pop(@ARGV); # create bootstrap data set if(!(opendir(BOOT,"bootstrap"))) { system("mkdir bootstrap"); } #configuration file input if($file) { open(CONFIG, $file) or die "$file not found\n"; while () { if(!(/^\#/)) { ($score,$dist,$loci,$cycles) = split(); if($loci) { $dist{$score} = $dist; $cycles{$score.":".$loci} = $cycles; if($cycles > $maxcycles{$loci}) { $maxcycles{$loci} = $cycles; } } } } } # command line input else { if(!($cycles)) { $cycles = 100; } if(!($loci)) { $loci = 200; } if(!($score)) { $score = 35; } $maxcycles{$loci} = $cycles; $cycles{$score.":".$loci} = $cycles; } foreach $locus (keys(%maxcycles)) { foreach $j (1 .. $maxcycles{$locus}) { my $output = "bootstrap/$input.$locus.$j"; # if file doesn't exist, create it if(!(-e $output)) { $command = "map_boot.perl -l=".$locus.($repuls ? " -r " : " ").$input." > ".$output; warn "Running $command\n"; system($command);# or die "some problem running $command\n"; } } } print "Rep\tLod\tDist\tLoci\tReps\tLinked\tUncorr Genome\tVar\tSD\tCorr Genome\tVar\tSD\n"; foreach my $scoreloc (sort(keys(%cycles))) { my $mlod = new Stats::Basic; my $mapdist = new Stats::Basic; my $cmapdist = new Stats::Basic; my ($score,$loc) = split(/:/,$scoreloc); foreach my $cycle (1 .. $cycles{$scoreloc}) { local (*MAPIN); $mminput = $input."mapin.txt"; open(MAPIN, ">",$mminput) or die "can't create $mminput\n"; my $mapinfile = "bootstrap/".$input.".".$loc.".".$cycle; # warn "Processing $mapinfile\n"; my $mapdatfile = "bootstrap/".$input.".".$loc; printf(MAPIN "pd %s\nld %s\nseq all\nlod table\n", $mapinfile,$mapdatfile); # $mapout = $output.".lod".$score; # system("mapmaker < mapin.txt > $mapout"); my @mapout = `mapmaker < $mminput`; $links=0; foreach $j (0 .. @mapout) { if(($mapout[$j] =~ /^\s+[0-9]/) && ($mapout[$j] !~ />/)) { ($n,@lod) = split(/\s+/,$mapout[$j]); # print "score: $score $mapout[$j]"; foreach my $i (0 .. $#lod) { if(($lod[$i] ne "-") && ($lod[$i] >= $score)) { # print "$i $lod[$i] $score\n"; $links++; } } } } # divide linkages by 2 to account for repulsion phase $links/= ($repuls ? 2 : 1); if($links) { $mlod->AddData($links); $genomelength = ($loc*($loc-1))*$dist{$score}/($links); $mapdist->AddData($genomelength); $num = 1-((2*19*$links)/($loc*($loc-1))); if($num > 0) { $cgenomelength = (($loc*($loc-1))*$dist{$score}/(2*$links))*(1+sqrt($num)); $cgenome=sprintf("%.2f",$cgenomelength); $cmapdist->AddData($cgenomelength); } else { $cgenome = "undef"; warn "Could not calculate corrected genome length for lod $score $loc loci rep $cycle\n";; } print "$cycle\t$score\t$dist{$score}\t$loc\t1\t",$links,"\t",; printf("%.2f\t\t\t%s\n", $genomelength,$cgenome); } else { $genomelength = 0; warn "No linkage for $score $loc $input\n"; } } # @temp = $mlod->GetData; # foreach my $i (0 .. @temp) { print "$temp[$i]\t"; } # print "\n"; print "Total\t$score\t$dist{$score}\t$loc\t",$mlod->Count(),"\t",$mlod->Mean(),"\t"; printf("%.2f\t%.2f\t%.2f\t", $mapdist->Mean(),$mapdist->Variance(),sqrt($mapdist->Variance())); printf("%.2f\t%.2f\t%.2f\n", $cmapdist->Mean(),$cmapdist->Variance(),sqrt($cmapdist->Variance())); }