Saturday, November 21, 2015

Language: Shell/perl: FASTA Manipulations............

################################################## 
Count the number of fasta sequences in the file
grep ">" file | wc -l

#To grab the headers
awk < seq.fasta '/^>/ { print $0 }'

#Extract sequences from fasta
awk < seq.fasta '$0 !~ /^>/ { print $0 }'

#Deletes odd number lines(header) 
perl -ne 'print if $. % 2 == 0' file.fa > header.fa

################################################## 
To BLAST a query sequence with a subject sequence. It is blastn because the sequences are nucleotide sequences. Had it been protein, it would have been blastp.  Here the query is a a small sequence and subject is a whole genome. Output is result_IS1070.fasta
blastn -query  IS1070.fasta  -subject isolate.fasta   > result_IS1070.fasta
################################################## 

To search the word in the input  file and count its occurrences. Command grep will search the word whole wc will count the frequency.
grep 'Strand' result_IS1070.fasta | wc
################################################## 
Above two commands can be combined
blastn -query  IS1070.fasta -subject isolate.fasta | grep 'Strand'| wc 

To find the occurrence in plus strand or minus strand
blastn -query  IS1070.fasta -subject isolate.fasta > result_IS1070.fasta
grep "Strand" result_IS1070.fasta |grep "Strand=Plus/Plus" | wc -l

blastn -query  IS1070.fasta -subject isolate.fasta > result_IS1070.fasta
grep "Strand" result_IS1070.fasta |grep "Strand=Plus/Minus" | wc -l
###################################################
 #To find length of each fasta sequence
cat /home/pseema/hypothetical_analysis/protein_fasta_files/H37Rv.faa | awk '$0 ~ ">" {print c; c=0;printf substr($0,2,100) "\t"; } $0 !~ ">" {c+=length($0);} END { print c; }'
###################################################
I wanted to fragment my fasta file into files containing individual fasta sequences. This code did the job in an instant, though it can be messy for huge number of sequences. My input file was IS_elements.fasta.
csplit -z  IS_elements.fasta '/>/' '{*}'

A perl script can do it and split the multi sequence fasta file into desired number of small files.
perl split_fasta.pl input_file
perl split_fasta.pl -i input_file -o output_file - 50
###################################################
To find the number of CDS in the annotated genome
echo "Total number of CDS is : "
grep  ">" genome.faa | wc -l > total_number_CDS
x=$(grep  ">" genome.faa | wc -l)
echo "value of x is: "
echo $x

To find the number of transposase genes in the annotated genome
echo "Number of transposase genes is : "
grep  "transposase " genome.faa  | wc -l
grep  "transposase " genome.faa  | wc -l > only_transposase
y=$(grep  "transposase " genome.faa  | wc -l)
echo "value of y is: "
echo $y

Percentage calculation for the particular type of gene 
percent=$((100*$y/$x))
echo "Percent of transposase genes is: "
echo $percent
###################################################
To find the minimum fasta length among given sequences
echo "The minimum  length is : "
cut -f1 -d"," fasta_file |  head -1

To find the maximum fasta length among given sequences
echo "The maximum  length is : "
cut -f1 -d"," fasta_file |  tail -1
###################################################
To find the number of proteins with length 50 or below
echo "The numbers of proteins with length 50 or below are : "
cat fasta_file | awk '{if($1==$1+0 && $1<=50) print $1}' | sort -n | wc -l
z=$(cat fasta_file | awk '{if($1==$1+0 && $1<=50) print $1}' | sort -n | wc -l)
echo "value of z is: "
echo $z

Calculation of the percentage of protein satisfying above condition
percentage=$((100*$z/$y))
echo "Percent of proteins with length less than 100aa is: "
echo $percentage
###################################################
To search the frequency of  genomic features e.g. CRISPR and tRNA count in the log_file. The command grep is very effective for pattern search.
grep  "CRISPR" log_file
grep  "tRNAs" log_file
###################################################
Add a column to a file. Here a the field 1 from small_file is added to bib_file which will be output to merged_file
awk '{print $1}' < small_file | big_file - > merged_file

To sort a column based on a column . Here sorting based on field 4 and 5
sort -k 4,4 -k 5,5 input_file > sorted_file
###################################################
Find all entries with # symbol on both flanking sides
grep "^#" filename > file_with_hashtag
wc -l file_with_hashtag

Globally replace all # symbols with null
sed 's/#//g' file_with_hashtag > stripped_file
cat stripped_file

Extract column 1 from a file and paste it to a file with columns with merged file as output
awk '{print $1}' < file1 | paste file2 - > merged_file
###################################################
Find the pattern and print the line next to it. Here the pattern searched is 'Smallest protein' so next line is the length of the smallest protein. This code can be used to find and print the largest protein as well
perl -ne 'if ($p) { print; $p = 0 } $p++ if /Smallest protein/' protein_list > smallest_protein 
sort -n smallest_protein > sorted_smallest_protein
###################################################
To add legend to the data file (when there are many columns but no header, this code can add headers)
{ printf  'ISOLATE\tName\tLineage\tCountry\tRibonuclease\n'; cat filename; } > file_with_legend
###################################################
Alignment of data to legends for easy read (Its useful when column has shifted away from the respective header)
column -t filename > aligned_file && mv aligned_file  filename
###################################################
To extract only headers of the sequences
grep "^>" fasta_file > only_header |head
###################################################
#To extract multiple columns, suppose 2,3,4,5,6 
awk  '{print $2 $3 $4 $5 $6}' only_header > extracted_columns 
###################################################
To sort alphabetically
sort -u fasta_file > sorted
###################################################
To convert a multiline fasta sequence into single line (it makes working convenient)
awk '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);}  END {printf("\n");}' < fasta_file > single_seq_fasta
###################################################
To find the fasta sequences without the specific term in its header, suppose without 'toxin'.
awk '!/toxin/' fasta_file > seq_without_toxin
###################################################
This code will find the fasta sequence of the particular length, suppose we want to find the fasta sequence with length 32aa 
awk '!/^>/ { next } { getline seq } length(seq) == 32 { print $0 "\n" seq }' protein_fasta  

This code will find the fasta sequence of the particular length, suppose we want to find the fasta sequence with length 4,000 aa. The code sometimes fail to find the large proteins, which might be fixed with conversion of ==  to >= symbol.
awk '!/^>/ { next } { getline seq } length(seq) == 4,000 { print $0 "\n" seq }' protein_fasta 
###################################################
To find a particular type of proteins in the protein list, suppose nuclease
awk '/nuclease/' protein_fasta > nuclease
wc -l nuclease
###################################################
To find start and end position of a sequence. After BLAST, find positions of subject, extract position in bp ($2 or $2 depending on BLAST output format). Join both positions, for ease of analysis get rid of RHS digits; if $1 value is big, seq is in plus strand and if $2 value is big, seq is in minus strand; print the plus and minus strand-located sequences

grep "TGAACCGCCCCGGCAGGATGT" fasta_file > start_position
awk '/Sbjct/' start_position > only_subj_start
awk '{print $2}' only_subj_start > column_2

grep "AGATCAGAGAGTCTCGGTTCA" fasta_file > end_position
awk '/Sbjct/' end_position > only_subj_end
awk '{print $4}' only_subj_end > column_4

paste column_2 column_4 > start_end.txt
sort -n start_end.txt > sorted_start_end.txt

cut -f 1 sorted_start_end.txt > field_1
#Remove last 3 characters
sed 's/...$//' field_1 > field1

cut -f 2 sorted_start_end.txt > field_2
#Remove last 3 characters
sed 's/...$//' field_2 > field2

paste field1 field2 > trimmed
cat trimmed| wc -l
#For plus strand
awk '$1 < $2 {print $0}' trimmed > trimmed1
#For minus strand
awk '$1 > $2 {print $0}' trimmed > trimmed2
cat trimmed1
cat trimmed1 | wc -l
cat trimmed2
cat trimmed2 | wc -l
###################################################
To find unique insertion location of a sequence, extract field with position, sort them numerically to see common occurrences, find the unique numbers
awk  '{print $2}' filename > only_field_2
cat only_field_2 | sort -n | uniq -c
###################################################
#To find the count of A, T, C, G in fasta sequence s
while read isolate
do

grep -v '^>' /home/pseema/Desktop/virus_genomes/dengue_genomes/$isolate.fasta | tr -d '\n' | sed -e 's/\(.\)/\1\n/g' | sort | uniq -c | sort -rn  > /home/pseema/Desktop/virus_genomes/results/combined_dengue_$isolate.GC

echo "AGTC profile of Dengue $isolate..."
echo "A G T C"
cat /home/pseema/Desktop/virus_genomes/results/combined_dengue_$isolate.GC
echo "A G T C sorted"
cat /home/pseema/Desktop/virus_genomes/results/combined_dengue_$isolate.GC | sort -n
awk '/A/ {print $1}' /home/pseema/Desktop/virus_genomes/results/combined_dengue_$isolate.GC
awk '/G/ {print $1}'  /home/pseema/Desktop/virus_genomes/results/combined_dengue_$isolate.GC
awk '/T/ {print $1}'  /home/pseema/Desktop/virus_genomes/results/combined_dengue_$isolate.GC
awk '/C/ {print $1}' /home/pseema/Desktop/virus_genomes/results/combined_dengue_$isolate.GC


done < denv_list

# Convert fasta to tab

awk 'BEGIN{RS=">"}{gsub("\n"," ",$0); print ">"$0}' file.fa > file.tab

#Convert tab to fasta
awk '{print ""$1" "$2" "$3" "$4" "$5"\n"$6}' file.tab > file.fasta

#Find empty fasta 
grep -c "^$" db.fa

#To remove the empty fasta seq
awk 'BEGIN {RS = ">" ; FS = "\n" ; ORS = ""} $2 {print ">"$0}' file.fa

#To rename FAsta sequence header name as that of the file (very useful)
#It renames header as the isolate name or protein name
 #! usr/bin/bash
while read isolate;
do
echo "Starting the $isolate genome"
awk '/>/{sub(">","&"FILENAME"_");sub(/\.fasta/,x)}1' /home/pseema/denovo_analysis/dir1/$isolate.fasta > /home/pseema/denovo_analysis/dir2/$isolate.fasta
done < /home/pseema/denovo_analysis/input_files/isolate_list

#filter out fasta File by pattern (take only words with a pattern )
 awk '/^>/ {P=index($0,"pattern")==0} {if(P) print} ' in.fasta > out.fasta
 
 #removes duplicate fasta seq by IDs
 awk 'BEGIN{RS=">"}NR>1{sub("\n","\t"); gsub("\n",""); print RS$0}' biostarhelp.txt | awk '!seen[$1]++' | awk -v OFS="\n" '{print $1,$2}'


#Extracting Multiple Fasta Sequences At A Time From A File Containing Many Sequences
usage: perl thisScript.pl fastaFile.fa queryFile.txt
use Bio::DB::Fasta;

my $fastaFile = shift;
my $queryFile = shift;

my $db = Bio::DB::Fasta->new( $fastaFile );
open (IN, $queryFile);
while (<IN>){
    chomp;
    $seq = $_;
    my $sequence = $db->seq($seq);
    if  (!defined( $sequence )) {
            die "Sequence $seq not found. \n"
    }  
    print ">$seq\n", "$sequence\n";
}

No comments:

Post a Comment