linux 中shell统计fasta文件中每条染色体中的碱基数目
1、测试数据
[root@centos7 test]# ls test.fa [root@centos7 test]# cat test.fa >chr1 addgg ddges df >chr2 ertfg sdf >chr3 edret dfdff sfdfd d >chr4 iejie sdgg
2、在文件末尾追加一个标记染色体（>chrxxx），作为最后一条染色体序列的结束边界
[root@centos7 test]# sed -i '$a >chrxxx' test.fa [root@centos7 test]# cat test.fa >chr1 addgg ddges df >chr2 ertfg sdf >chr3 edret dfdff sfdfd d >chr4 iejie sdgg >chrxxx
3、生成循环配置文件（每行两列：当前染色体名称和下一条染色体名称，分别作为序列的起始标记和结束标记）
[root@centos7 test]# grep ">chr" test.fa | sed '$d' | paste - <(grep ">chr" test.fa | sed '1d') >chr1 >chr2 >chr2 >chr3 >chr3 >chr4 >chr4 >chrxxx [root@centos7 test]# grep ">chr" test.fa | sed '$d' | paste - <(grep ">chr" test.fa | sed '1d') > conf.txt [root@centos7 test]# ls conf.txt test.fa [root@centos7 test]# cat conf.txt >chr1 >chr2 >chr2 >chr3 >chr3 >chr4 >chr4 >chrxxx
4、统计每条染色体上的碱基数目
[root@centos7 test]# ls conf.txt test.fa [root@centos7 test]# cat conf.txt >chr1 >chr2 >chr2 >chr3 >chr3 >chr4 >chr4 >chrxxx [root@centos7 test]# cat conf.txt | while read {i,j}; do sed -n "/$i/,/$j/{/$i\|$j/b; p}" test.fa | sed 's/[\t ]*//g' | paste -d "" -s | awk -v a=$i '{print a, length}' >> result.txt; done [root@centos7 test]# ls conf.txt result.txt test.fa [root@centos7 test]# cat result.txt ## 查看结果 >chr1 12 >chr2 8 >chr3 16 >chr4 9 [root@centos7 test]# cat test.fa ## 验证结果 >chr1 addgg ddges df >chr2 ertfg sdf >chr3 edret dfdff sfdfd d >chr4 iejie sdgg >chrxxx
5、或者直接使用 awk 一步完成统计（无需添加标记染色体，也无需生成配置文件）
[root@centos7 test]# ls test.fa [root@centos7 test]# awk '/^>/{if (l!="") print l; print; l=0; next}{l+=length($0)}END{print l}' test.fa >chr1 12 >chr2 8 >chr3 16 >chr4 9 [root@centos7 test]# cat test.fa ## 验证结果 >chr1 addgg ddges df >chr2 ertfg sdf >chr3 edret dfdff sfdfd d >chr4 iejie sdgg

浙公网安备 33010602011771号