[笔记]sam2bed
sam2bed.pl
#!/usr/bin/perl -w
#
########################################
#
# File Name:
#   sam2bed.pl
#
# Description:
#   Convert SAM alignments to BED intervals.  Every mapped record is
#   written to <.bed> as a 6-column line (chrom, start, end, name,
#   score, strand).  When the optional third argument is given, reads
#   whose CIGAR contains a skipped region (N) are additionally written
#   to [.splice.bed] with block count / block sizes / block starts.
#
#   SAM positions are 1-based; BED starts are 0-based, so the start is
#   POS - 1 and the end is start + number of reference bases covered.
#
# Usage:
#   sam2bed.pl <.sam> <.bed> [.splice.bed]
#
# Author:
#   Xi Wang, wang-xi05@mails.thu.edu.cn
#
# Date:
#   Mon Nov  2 23:08:26 CST 2009
#
########################################

use strict;
use warnings;

my $usage = "$0 <.sam> <.bed> [.splice.bed]\n";

my $infile  = shift;
my $outfile = shift;
die $usage unless defined $infile && defined $outfile;

# The third argument is optional; its presence switches on splice output.
my $outfileSplice = shift;
my $splice = (defined $outfileSplice && length $outfileSplice) ? 1 : 0;

open(my $in, '<', $infile)
    or die "Can't open $infile for reading: $!\n";
open(my $out, '>', $outfile)
    or die "Can't open $outfile for writing: $!\n";
my $outs;
if ($splice) {
    open($outs, '>', $outfileSplice)
        or die "Can't open $outfileSplice for writing: $!\n";
}

# cigar_blocks($cigar)
#
# Walk a CIGAR string and describe the aligned blocks on the reference.
#
# Returns (\@sizes, \@starts, $ref_len):
#   @sizes   - length of each aligned block, in reference bases
#   @starts  - offset of each block relative to the alignment start
#   $ref_len - total reference span covered by the alignment
# Returns an empty list when the CIGAR is '*' or cannot be parsed in
# full, so the caller can fall back to another length estimate.
#
# M, D, = and X consume reference bases and extend the current block;
# N consumes reference bases and closes the current block (intron);
# I, S, H and P do not consume reference bases and are ignored.
sub cigar_blocks {
    my ($cigar) = @_;

    my (@sizes, @starts);
    my $ref_pos    = 0;    # reference bases consumed so far
    my $block_open = 0;    # true while the last op extended a block

    # \G with /gc walks the string token by token and leaves pos() at
    # the first character that is not part of a valid <count><op> pair.
    while ($cigar =~ /\G(\d+)([MIDNSHP=X])/gc) {
        my ($count, $op) = ($1, $2);
        if ($op eq 'N') {
            $ref_pos += $count;
            $block_open = 0;
        }
        elsif ($op eq 'M' || $op eq 'D' || $op eq '=' || $op eq 'X') {
            if ($block_open) {
                $sizes[-1] += $count;
            }
            else {
                push @starts, $ref_pos;
                push @sizes,  $count;
                $block_open = 1;
            }
            $ref_pos += $count;
        }
    }

    my $consumed = pos($cigar);
    return unless defined $consumed && $consumed == length $cigar;
    return unless @sizes;
    return (\@sizes, \@starts, $ref_pos);
}

while (my $line = <$in>) {
    next if $line =~ /^\@/;    # header lines
    chomp $line;

    my @col = split ' ', $line;
    if (@col < 11) {
        # A SAM record has 11 mandatory fields; anything shorter cannot
        # be converted.  Blank lines are skipped silently.
        warn "$infile line $.: fewer than 11 fields, skipped\n" if @col;
        next;
    }

    my ($flag, $chr, $pos, $cigar, $seq) = @col[1, 2, 3, 5, 9];

    # '*' means "no reference"; compare exactly so that reference names
    # that merely contain an asterisk are not dropped.
    next if $chr eq '*';

    my $s   = $pos - 1;                    # 1-based SAM -> 0-based BED
    my $str = ($flag & 16) ? '-' : '+';    # 0x10: read on reverse strand

    # Edit distance from the optional NM tag; may have several digits.
    # Left empty when the tag is absent, so the name is a bare "U".
    my $mm = '';
    for my $tag (@col[11 .. $#col]) {
        if ($tag =~ /^NM:i:(\d+)$/) {
            $mm = $1;
            last;
        }
    }

    # Prefer the reference span from the CIGAR; fall back to the read
    # length only when the CIGAR is unavailable or unparsable.
    my ($sizes, $starts, $ref_len) = cigar_blocks($cigar);
    my $e = $s + (defined $ref_len ? $ref_len : length $seq);

    if ($splice && $sizes && @$sizes > 1) {    # splice reads
        my $n       = scalar @$sizes;
        my $lengths = join(',', @$sizes) . ',';
        my $offsets = join(',', @$starts) . ',';
        print {$outs} "$chr\t$s\t$e\tU$mm\t0\t$str\t-\t-\t-\t$n\t$lengths\t$offsets\n";
    }
    print {$out} "$chr\t$s\t$e\tU$mm\t0\t$str\n";
}

close $in;
# Buffered write errors only surface at close, so check the outputs.
close($out) or die "Error closing $outfile: $!\n";
if ($splice) {
    close($outs) or die "Error closing $outfileSplice: $!\n";
}
值得留意的是,SAM格式文件采用1-based坐标系统(coordinate system),即第一个碱基的位置为1;
而bed格式,则是0-based。
来自于 Xi Wang, wang-xi05@mails.thu.edu.cn

浙公网安备 33010602011771号