#! /usr/local/bin/perl # map_recomb_boot.perl: calculate number of recombinations per linkage # group and individual. Takes haplotype file as argument in mapmaker # format, more or less, with some data prepended. Bootstraps in # stratified fashion based on number of markers in lg # Steve DiFazio, 5-2003 use Getopt::Long; use Stats::Basic; GetOptions( "c:i" => \$cycles, "l:i" => \$samploci, "a" => \$allloc); if(!$ARGV[0] ) { die " _______________________________________________________________\n Usage: perl map_recomb_boot.pl -l= -c= -a > out.txt \n where \tformat: distance loci cycles \t is the number of loci to pull out of file (default 200), \t is the number of times dataset is to be sampled (default 100) \t-a results in bootstraps including all loci, not just 1 locus per bin, \t is input file with all genotype info after locus,lg, and pos. \t is the name of the file to save the output\n _______________________________________________________________\n\n"; } warn "\n"; if(!($samploci)) { $samploci = 200; } if(!($cycles)) { $cycles = 100; } $infile = $ARGV[0]; while(<>) { if(/^\#/) { ($a,$b,$c,@prog)=split(/\t/); $header = $a."\t".$b."\t".$c; foreach $i (0 .. $#prog) { if($prog[$i]) { $header .= "\t".$prog[$i]; } } } else { chomp(); if($. > 0) { my ($loc,$lg,$pos,@genos)=split(/\t/); if($#genos ne $#prog) { die "ERROR: $#genos genos and $#prog progeny at line $. of $ARGV\n"; } if($loc) { # only framework markers used, so count each position only once unless -a option used if(!($counted{$lg.":".$pos}) || $allloc) { $totloc++; $loci{$lg}++; $counted{$lg.":".$pos} = 1; } #prepend asterisk so map_boot.pl will work if($loc !~ /^\*/) { $loc = "*".$loc; } my $genotype = $loc."\t".$lg."\t".$pos;; foreach my $i (0 .. $#genos) { if($genos[$i] ne "-") { $genotypes{$genos[$i]} = 1; } $genotype .= "\t".$genos[$i]; } push(@{$genos{$lg}},$genotype); } } } } # create bootstrap data directory if(!(opendir(BOOT,"bootstrap"))) { system("mkdir bootstrap"); } # save lg data to separate files for individual bootstrapping foreach my $lg (sort(keys(%loci))) { #number of loci to sample per lg $sample{$lg} = sprintf("%.0f",$samploci*$loci{$lg}/$totloc); if($sample{$lg} > $loci{$lg}) { $sample{$lg} = $loci{$lg}; } my $datfile = "bootstrap/".$infile.".".$lg.".txt"; if(!(-e $datfile)) { open(LG, ">",$datfile) or die "problem opening $datfile\n"; printf (LG "%s",$header); foreach $i (0 .. $#{$genos{$lg}}) { printf (LG "%s\n",$genos{$lg}[$i]); } } foreach $i (1 .. $cycles) { my $bootfile = "bootstrap/".$infile.".".$lg.".".$samploci.".".$i.".txt"; if(!(-e $bootfile)) { $command = "map_boot_frame.perl -l=".$sample{$lg}." ".$datfile; if($allloc) { $command .= " -a"; } $command .= " > ".$bootfile; warn "Running $command\n"; system($command);# or die "some problem running $command\n"; } } } #establish stat objects for overall calcs foreach my $lg (sort(keys(%loci))) { $recomblg{$lg} = new Stats::Basic; foreach my $gen (keys(%genotypes)) { $blocklg{$gen.":".$lg} = new Stats::Basic; } } $genrecomb = new Stats::Basic; foreach my $gen (keys(%genotypes)) { $genblock{$gen} = new Stats::Basic; } foreach $j (1 .. $cycles) { # for getting total recombination my $totrecomb = new Stats::Basic; foreach my $gen (keys(%genotypes)) { $totblock{$gen} = new Stats::Basic; } my (@totrecombin,%prevgeno,%prevpos,%block); foreach my $lg (sort(keys(%loci))) { $recomb{$lg} = new Stats::Basic; foreach my $gen (keys(%genotypes)) { $mblock{$gen.":".$lg} = new Stats::Basic; } my $bootfile = "bootstrap/".$infile.".".$lg.".".$samploci.".".$j.".txt"; my ($tricho,$tottricho) = 0; my (@recombin); open(BOOT, $bootfile) or die "ERROR: $bootfile not found\n"; while() { if(/^\#/) { ($loc,$tlg,$pos,@prog) = split(/\t/); } else { ($loc,$tlg,$pos,@genos) = split(/\t/); if($#genos != $#prog) { die "ERROR: $#genos Genotypes and $#prog progeny at line $. in $bootfile\n"; } foreach $i (0 .. $#genos) { if($genos[$i] eq "-") { next; } else { $markers{$lg}[$i]++; $totmarkers[$i]++; if($genos[$i] eq "H") { $tricho[$i]++; $tottricho[$i]++; } if($prevgeno{$lg}[$i]) { if($genos[$i] ne $prevgeno{$lg}[$i]) { $block{$genos[$i].":".$lg}[$i] += ($pos - $prevpos{$lg}[$i])/2; $block{$prevgeno[$i].":".$lg}[$i] += ($pos - $prevpos{$lg}[$i])/2; $block{$genos[$i]}[$i] += ($pos - $prevpos{$lg}[$i])/2; $block{$prevgeno[$i]}[$i] += ($pos - $prevpos{$lg}[$i])/2; $recombin[$i]++; $totrecombin[$i]++; $prevgeno{$lg}[$i] = $genos[$i]; } else { $block{$genos[$i].":".$lg}[$i] += ($pos - $prevpos{$lg}[$i]); $block{$genos[$i]}[$i] += ($pos - $prevpos{$lg}[$i]); } } else{ $prevgeno{$lg}[$i] = $genos[$i]; $block{$genos[$i].":".$lg}[$i] += ($pos - $prevpos{$lg}[$i]); $block{$genos[$i]}[$i] += ($pos - $prevpos{$lg}[$i]); } $prevpos{$lg}[$i] = $pos; } } } } foreach $i (0 .. $#genos) { if(!($recombin[$i])) { $recombin[$i] = 0; } $recomb{$lg}->AddData($recombin[$i]); foreach my $gen (keys(%genotypes)) { if(!($block{$gen.":".$lg}[$i])) { $block{$gen.":".$lg}[$i] = 0; }; $mblock{$gen.":".$lg}->AddData($block{$gen.":".$lg}[$i]); } } $mean = $recomb{$lg}->Mean(); $recomblg{$lg}->AddData($mean); foreach my $gen (keys(%genotypes)) { $mean=$mblock{$gen.":".$lg}->Mean(); $blocklg{$gen.":".$lg}->AddData($mean); } } # mean genome-wide recombination for all progeny foreach $i (0 .. $#genos) { if(!($totrecombin[$i])) { $totrecombin[$i] = 0; } $totrecomb->AddData($totrecombin[$i]); foreach my $gen (keys(%genotypes)) { if(!($block{$gen}[$i])) { $block{$gen}[$i] = 0; } $totblock{$gen}->AddData($block{$gen}[$i]); } } # add mean for all progeny to object for mean among cycles $mean = $totrecomb->Mean(); $genrecomb->AddData($mean); foreach my $gen (keys(%genotypes)) { $mean = $totblock{$gen}->Mean(); $genblock{$gen}->AddData($mean); } } print "LG\tLoci\tReps\tMean Recomb\tVar\tMaxRecomb\t"; foreach my $gen (keys(%genotypes)) { print "$gen Block\tVar\t"; } print "\n"; foreach my $lg (sort(keys(%loci))) { print "$lg\t$samploci\t",$recomblg{$lg}->Count(),"\t"; printf("%.2f\t%.4f\t%.2f\t", $recomblg{$lg}->Mean(),$recomblg{$lg}->Variance(),$recomblg{$lg}->Max()); foreach my $gen (keys(%genotypes)) { printf("%.2f\t%.4f\t", $blocklg{$gen.":".$lg}->Mean(),$blocklg{$gen.":".$lg}->Variance()); } print "\n"; } print "Total\t$samploci\t",$genrecomb->Count(),"\t"; printf("%.2f\t%.4f\t%.2f\t", $genrecomb->Mean(),$genrecomb->Variance(),$genrecomb->Max()); foreach my $gen (keys(%genotypes)) { printf("%.2f\t%.2f\t", $genblock{$gen}->Mean(),$genblock{$gen}->Variance()); } print "\n"; # print "LG\tReps\t"; # foreach $i (0 .. $#prog) { # print "$prog[$i]\t"; # } # print "Mean\tSE\n" # foreach my $lg (sort(keys(%loci))) { # foreach $j (1 .. $cycles) { # print "$lg\t$j\t"; # foreach $i (0 .. $#genos) { # print $recomb{$lg}[$j][$i] ? $recomb{$lg}[$j][$i] : "0","\t"; # } # print "\n"; # } # } sub by_num { $a <=> $b; }