[笔记]sam2bed

sam2bed.pl
 1 #!/usr/bin/perl -w
 2 #
 3 ########################################
 4 #
 5 # File Name:
 6 #   sam2bed.pl
 7 # 
 8 # Description:
 9 #   
10 # 
11 # Usage:
12 #   
13 # 
14 # Author:
15 #   Xi Wang, wang-xi05@mails.thu.edu.cn
16 # 
17 # Date:
18 #   Mon Nov  2 23:08:26 CST 2009
19 #
20 ########################################
21 
use strict;
use warnings;

# sam2bed: convert the mapped alignments of a SAM file to BED.
#
# Usage:
#   sam2bed.pl <.sam> <.bed> [.splice.bed]
#
#   <.sam>         input SAM file ("-" reads standard input)
#   <.bed>         output, one 6-column BED line per mapped alignment
#   [.splice.bed]  optional output; every alignment whose CIGAR contains an
#                  N (skipped region) is additionally written here with
#                  block count, block sizes and block starts appended
#
# SAM positions are 1-based, BED starts are 0-based, hence start = POS - 1.
# The end coordinate is derived from the CIGAR string (only M, D, N, = and X
# consume reference bases), so soft-clipped, indel-containing and spliced
# reads get the correct span whether or not a splice file was requested.
# When CIGAR is unavailable ("*") the read length is used instead.
#
# The name column is "U" followed by the NM:i: mismatch count (empty when the
# alignment carries no NM tag); the score column is always 0.

my $usage = "$0 <.sam> <.bed> [.splice.bed]\n";
my $infile  = shift(@ARGV) || die $usage;
my $outfile = shift(@ARGV) || die $usage;
my $outfileSplice = shift(@ARGV);
my $splice = $outfileSplice ? 1 : 0;

my $in  = open_handle('<', $infile,  'reading');
my $out = open_handle('>', $outfile, 'writing');
my $outs;
if ($splice) {
    $outs = open_handle('>', $outfileSplice, 'writing');
}

while (my $line = <$in>) {
    next if $line =~ /^\@/;          # header lines
    chomp $line;

    my @col = split ' ', $line;
    if (@col < 10) {
        # The conversion needs FLAG, RNAME, POS, CIGAR and SEQ (fields 2-10).
        warn "skipping short line $.\n" if @col;
        next;
    }
    my ($flag, $chr, $pos, $cigar, $seq) = @col[1, 2, 3, 5, 9];
    next if $chr =~ /\*/;            # unmapped read

    my $s   = $pos - 1;
    my $str = ($flag & 16) ? '-' : '+';   # 0x10: read is reverse-complemented

    # Mismatch count from the NM optional field; may have several digits.
    my $mm = $line =~ /\sNM:i:(\d+)/ ? $1 : '';

    my ($span, @blocks) = cigar_blocks($cigar);
    $span = length $seq unless defined $span;   # no usable CIGAR
    my $e = $s + $span;

    if ($splice && $cigar =~ /N/) {   # splice reads
        my $n       = scalar @blocks;
        my $lengths = join '', map { "$_->[1]," } @blocks;
        my $starts  = join '', map { "$_->[0]," } @blocks;
        print {$outs} "$chr\t$s\t$e\tU$mm\t0\t$str\t-\t-\t-\t$n\t$lengths\t$starts\n";
    }
    print {$out} "$chr\t$s\t$e\tU$mm\t0\t$str\n";
}

close $in;
# Buffered write errors (e.g. disk full) only surface when the handle is closed.
close $out or die "Can't close $outfile: $!\n";
if ($splice) {
    close $outs or die "Can't close $outfileSplice: $!\n";
}

# open_handle(MODE, PATH, PURPOSE)
#   Open PATH with MODE ('<' or '>') and return a lexical filehandle.
#   PATH "-" duplicates STDIN / STDOUT, which the previous two-argument
#   open() provided implicitly.  Dies with the script's usual message,
#   naming the file that actually failed.
sub open_handle {
    my ($mode, $path, $purpose) = @_;
    my $fh;
    my $ok = $path eq '-'
        ? open($fh, "$mode&", $mode eq '<' ? \*STDIN : \*STDOUT)
        : open($fh, $mode, $path);
    $ok or die "Can't open $path for $purpose!\n";
    return $fh;
}

# cigar_blocks(CIGAR)
#   Returns (SPAN, BLOCK, BLOCK, ...) where SPAN is the number of reference
#   bases covered by the alignment and each BLOCK is [offset, length]
#   relative to the alignment start.  N operations separate blocks; M, =, X
#   and D extend the current block; I, S, H and P do not consume reference.
#   Returns the empty list when CIGAR is "*" or has no reference-consuming
#   operation, so the caller can fall back to the read length.
#   Dies on a syntactically invalid CIGAR or on an N that is not enclosed by
#   aligned blocks.
sub cigar_blocks {
    my ($cigar) = @_;
    return if $cigar eq '*';

    my $malformed = "error read! (malformed CIGAR '$cigar', input line $.)\n";
    $cigar =~ /\A(?:\d+[MIDNSHP=X])+\z/ or die $malformed;

    my @blocks;
    my $offset = 0;
    my $current;                      # [offset, length] of the block being built
    while ($cigar =~ /(\d+)([MIDNSHP=X])/g) {
        my ($count, $op) = ($1, $2);
        if ($op eq 'N') {
            $current or die $malformed;   # intron without a preceding block
            push @blocks, $current;
            undef $current;
            $offset += $count;
        }
        elsif ($op =~ /[MD=X]/) {
            $current ||= [$offset, 0];
            $current->[1] += $count;
            $offset += $count;
        }
    }

    return unless $current || @blocks;    # nothing aligned to the reference
    $current or die $malformed;           # CIGAR ends on an intron
    push @blocks, $current;
    return ($offset, @blocks);
}

 

值得留意的是,SAM格式文件采用1-based坐标系统(coordinate system),即第一个碱基的位置为1;

而BED格式则是0-based(起始位置从0开始计数,终止位置不包含在区间内),因此脚本中BED的起始位置等于SAM的POS减1。

来自于 Xi Wang, wang-xi05@mails.thu.edu.cn

posted @ 2012-07-16 18:43  Puriney  阅读(514)  评论(0)    收藏  举报