Thursday, February 11, 2016

Language: Perl (FASTA manipulations)...........

#! /usr/bin/perl 
use strict;
use warnings; 
#To remove fasta headers of sequences in a file 
 my $seq;
while (<>) {
 if (/^>/) {
         print "$seq\n" if $seq;
         undef $seq;
          next;
 }
    chomp;
    $seq .= $_;
    print "$seq\n" if $seq and of ARGV;
 

#Transcription (DNA into RNA) 
$DNA ='ATGGCACCTTAG';
print $DNA, "\n";
$RNA = $DNA;
$RNA =~ s/T/U/g;
print "Here is the RNA sequence: \n";
print "$RNA\n";
exit;
################################################
Read the protein from a file
$proteinfilename ='protein_list.pep';
open (PROTEINFILE, $proteinfilename);
$protein = <PROTEINFILE>;
close PROTEINFILE;
print "Here is the protein: \n";
print $protein;
exit;
################################################


Concatenation of sequences
$DNA1 ='ATTG';
$DNA2 = 'GGA';
print $DNA1, "\n";
print $DNA2, "\n\n";
$DNA3 ="$DNA1$DNA2";
$DNA3 =$DNA1. $DNA2;
print "Here is concatenated DNA: \n\n";
print "$DNA3\n\n";
exit;
#################################################
Reverse complementing of  sequence
$DNA ='ATGGCA';
print "$DNA \n";
$revcom = reverse $DNA;
#$revcom =~ s/A/T/g;
#$revcom =~ s/T/A/g;
#$revcom =~ s/C
#$revcom =~ s/G/C/g;
$revcom =~ tr/ACGTacgt/TGCAtgca/;
print "Here is the reverse complement of DNA: \n";
print "$revcom\n";
exit;
###################################################
Manipulation of array 
=for comment
@bases = ('A', 'C', 'G', 'T');
print "\n first base: "; print $bases[0];
print "\n second base: "; print $bases[1];
print "\n third base: "; print $bases[2];
print "\n fourth base: "; print $bases[3];
=cut
=for comment
@bases = ('A', 'C', 'G', 'T');
$base1 =pop@bases;
unshift (@bases, $base1);
print "end item at the start: ";
print "@bases\n";
=cut
=for comment
@bases = ('A', 'C', 'G', 'T');
$base2 =shift@bases;
push (@bases, $base2);
print "start item at the end: ";
print "@bases\n";
=cut
@bases = ('A', 'C', 'G', 'T');
print "@bases\n";
$a = @bases;
print "scalar context:  ";
print $a, "\n";
($a) = @bases;
print "list context:  ";
print $a, "\n";
exit;
####################################################
 Code to extract specific fasta sequences from a fasta file
#! /usr/bin/perl
my $usage="\nSelect specified sequences from a fasta file and print them out\n" .
    "Usage: $0 [-hvp] -f seqNamesFile [fastaFile]\n" .
    "   or  $0 [-hv] -m 'pattern' [fastaFile]\n" .
    "   or  $0 [-hv] -n 'i,j,k,...' [fastaFile]\n" .
    "   or  $0 [-hv] -l 'i-j,...' [fastaFile]\n".
    " -f: specify the filename which contains a list of sequence names\n" .
    "     Each line contains a name. Comments can be added with \"#\".\n" .
    " -p: 'p'attern.  With -f, each line contains perl type regular expression.\n" .
    "     If -p, -v & -f are specified simultaneously, the output includes\n".
    "     sequences which does NOT much any of the regular expressions.\n".
    "     NOTE: '#' should not be a part of the regular expression.\n".
    " -m 'pattern': extract the sequences whose name contains the pattern. \n".
    "               You can use perl type regular expressions\n".
    " -n 'range': take list of integers.  e.g. -n '-3,5,7-9, 12-' will\n".
    "             select sequences 1,2,3,5,7,8,9,12,13,...\n".
    "             This option can be used to reorder or doubling up some seqs\n".
    "             e.g., -n '3,1,2,2' will bring the 3rd sequence to the front,\n".
    "             and the 2nd sequence get doubled\n".
    "  -l 'size-range': select the sequences based on sequence length.\n".
    "                   only ATGC and degenerate characters (including N) are\n".
    "                   counted, but '-', '?' etc are not counted.\n".
    "                   It takes a list of integers. e.g. -l '500-' will select\n".
    "                   sequences longer than 499bp. -l '100-200,300-400' will\n".
    "                   select sequences in the given ranges (ends are included).\n".
    " -v: Select the sequences which are NOT specified or NOT matching the " .
       "pattern\n".
    "\nIf name of input file (fastaFile) is not given, STDIN is used.\n" .
    "But -f, -m, or -n is mandatory.\n" .
    "Note -vs is not tested well.\n";
my $sep = "\t";
my $lineWidth = 70;   # used to break the long sequences into lines.
our ($opt_h, $opt_f, $opt_l, $opt_m, $opt_n, $opt_v, $opt_p);
use Getopt::Std;
getopts('hf:l:m:n:vp') || die "$usage\n";
die "$usage\n" if (defined($opt_h));
if (defined($opt_f) + defined($opt_m) + defined($opt_n) + defined($opt_l) != 1) {
    die "$usage\nERROR: Please specify ONLY 1 of -f, -m, -n, or -l options.\n\n";
}
if (defined ($opt_f)) {
    @selectedSeqs = ReadSeqNameFile($opt_f);
}
die "ERROR: -p can be used only with -f.\n$usage\n"
    if (defined($opt_p) && (defined($opt_m) || defined($opt_n)));
@ARGV = ('-') unless @ARGV; # take STDIN when no arg.
my $seqFile = shift @ARGV;
my @dat = ReadInFASTA($seqFile);
if (defined ($opt_f)) {
    if (defined($opt_p)) {
@dat = SelectSeqsByPattern(\@dat, \@selectedSeqs);
    } else { # exact match
if (defined ($opt_v)) {
    # this part isn't tested well, so check it.
    @dat = SelectSeqsNotSpecified(\@dat, \@selectedSeqs);
} else {
    @dat = SelectSeqs(\@dat, \@selectedSeqs);
}
    }
} elsif (defined ($opt_m)) {
    my @tmpPattern = ($opt_m);
    @dat = SelectSeqsByPattern(\@dat, \@tmpPattern);
} elsif (defined ($opt_n)) {
    @dat = SelectSeqsByNumber(\@dat, $opt_n);
} elsif (defined ($opt_l)) {
    @dat = SelectSeqsByLength(\@dat, $opt_l);
}
PrintFASTA(@dat);
exit(0);
# takes an arg; name of a file from which data are read Then read in
# the data and make an array.  Each element of this array corresponds
# to a sequence, name tab data.
sub ReadInFASTA {
    my $infile = shift;
    my @line;
    my $i = -1;
    my @result = ();
    my @seqName = ();
    my @seqDat = ();
    open (INFILE, "<$infile") || die "Can't open $infile\n";
    while (<INFILE>) {
        chomp;
        if (/^>/) {  # name line in fasta format
            $i++;
            s/^>\s*//; s/^\s+//; s/\s+$//;
            $seqName[$i] = $_;
            $seqDat[$i] = "";
        } else {
            s/^\s+//; s/\s+$//;
    s/\s+//g;                  # get rid of any spaces
            next if (/^$/);            # skip empty line
            s/[uU]/T/g;                  # change U to T
            $seqDat[$i] = $seqDat[$i] . uc($_);
        }
# checking no occurence of internal separator $sep.
die ("ERROR: \"$sep\" is an internal separator.  Line $. of " .
     "the input FASTA file contains this charcter. Make sure this " .
     "separator character is not used in your data file or modify " .
     "variable \$sep in this script to some other character.\n")
    if (/$sep/);
    }
    close(INFILE);
    foreach my $i (0..$#seqName) {
$result[$i] = $seqName[$i] . $sep . $seqDat[$i];
    }
    return (@result);
}
sub GetSeqDat {
    my @data = @_;
    my @line;
    my @result = ();
    foreach my $i (@data) {
@line = split (/$sep/, $i);
push @result, $line[1];
    }
    return (@result)
}
sub GetSeqName {
    my @data = @_;
    my @line;
    my @result = ();
    foreach my $i (@data) {
@line = split (/$sep/, $i);
push @result, $line[0];
    }
    return (@result)
}
sub ReadSeqNameFile {
    my $file = shift;
    my @result = ();
    open(INFILE, "<$file") || die "Can't open $file\n";
    while (<INFILE>) {
        chomp;
        s/#.*$//;    # remove comments (#)
        s/^\s+//; s/\s+$//;
        next if (/^$/);
        push @result, $_;
    }
    close(INFILE);
    return @result;
}
sub SelectSeqsNotSpecified {
    my ($seqARef, $nameARef) = @_;
    my @seqName = GetSeqName(@$seqARef);
    my @seqDat = GetSeqDat(@$seqARef);
    my %nameH = ();
    foreach my $n (@$nameARef) {
$nameH{$n} = 1;
    }
    my @result = ();
    for(my $i = 0; $i < @seqName; $i++) {
next if (defined($nameH{$seqName[$i]}));
push @result, $seqName[$i] . $sep . $seqDat[$i];
    }
    return (@result);
}
sub SelectSeqs {
    my ($seqARef, $nameARef) = @_;
    my @seqName = GetSeqName(@$seqARef);
    my @seqDat = GetSeqDat(@$seqARef);
    # make a hash table
    my %seqHash = ();
    for my $i (0..$#seqName) {
if (exists($seqHash{$seqName[$i]})) {
    die "ERROR: In fasta file, there are more than 1 entry " .
"which has the name $seqName[$i]\n";
} else {
    $seqHash{$seqName[$i]} = $seqDat[$i];
}
    }
    # select the specified seqs
    foreach my $name (@$nameARef) {
if (exists($seqHash{$name})) {
    my $tmp = $name . $sep . $seqHash{$name};
    push @result, $tmp;
} else {
    warn "WARN: $name didn't occur in the input file\n";
}
    }
    return @result;
}
sub SelectSeqsByLength {
    my ($seqARef, $rangeTxt) = @_;
    my $numSeqs = scalar(@$seqARef);
    my @seqName = GetSeqName(@$seqARef);
    my @seqDat = GetSeqDat(@$seqARef);
    my @lenArr = ();
    foreach my $thisSeq (@seqDat) {
push @lenArr, SeqLenDegen($thisSeq);
    }
    my $maxLen = Max(@lenArr);
    my @allowedRange = MkSelIndex($maxLen, $rangeTxt); # this is 0-offset
    map {$_ ++} @allowedRange; # 1-offset now
    if (defined($opt_v)) {
my @allIndex = 1..$maxLen;
my @complement = InANotInB(\@allIndex, \@allowedRange);
@allowedRange = @complement;
    }
    @result = ();
    foreach my $i (0..$#seqDat) {
if (MemberQ($lenArr[$i], \@allowedRange)) {
    push @result, $seqName[$i] . $sep . $seqDat[$i];
    warn "INFO: $seqName[$i] = $lenArr[$i] bp. selected\n";
} else {
    #warn "INFO: $seqName[$i] = $lenArr[$i] bp. ignored\n";
}
    }
    return @result;
}
# IUPAC
sub SeqLenDegen {
    my $seq = shift;
    if ($aminoAcidSeq) {
$seq =~ s/[^ACDEFGHIKLMNPQRSTVWYZ]//gi;
    } else {
$seq =~ s/[^ACGTURYMKSWBDHVN]//gi;
    }
    return length($seq);
}
sub SelectSeqsByNumber {
    my ($seqARef, $rangeTxt) = @_;
    my $numSeqs = scalar(@$seqARef);
    my @index = MkSelIndex($numSeqs, $rangeTxt);
    if (defined($opt_v)) {
my @allIndex = 0..($numSeqs-1);
my @complementIndex = InANotInB(\@allIndex, \@index);
@index = @complementIndex;
    }
    my @result = ();
    foreach my $i (@index) {
push @result, $$seqARef[$i];
    }
    return @result;
}
# This is from selectSites.pl
# returns 0 offset index
sub MkSelIndex {
    my ($max, $siteList) = @_;
    $siteList =~ s/^\s+//;
    $siteList =~ s/\s+$//;
    my @sites = split(/\s*,\s*/, $siteList);
    my @result = ();
    foreach my $item (@sites) {
if ($item =~ /^(\d+)-(\d+)$/) {
    die "ERROR: 1st number is larger than 2nd in $item\n" if ($1 > $2);
    $beginPos = $1 - 1;
    $endPos = $2 - 1;
} elsif ($item =~ /^-(\d+)$/) {
    $beginPos = 0;
    $endPos = $1 - 1;
} elsif ($item =~ /^(\d+)-$/) {
    $beginPos = $1 - 1;
    $endPos = $max-1;
} elsif ($item =~ /^(\d+)$/) {
    $beginPos = $1 - 1;
    $endPos = $1 - 1;
} else {
    die "$siteList given as the list of sites.  " .
"Make sure it is comma delimitted, and each element is " .
    " one of the forms: 23-26, 29, -10, 40-\n";
}
push (@result, $beginPos..$endPos);
    }
    return(@result);
}
sub SelectSeqsByPattern {
    my ($seqARef, $patternARef) = @_;
    my @seqName = GetSeqName(@$seqARef);
    my @seqDat = GetSeqDat(@$seqARef);
    my @result = ();
    my @matchedNames = ();
    foreach my $pattern (@$patternARef) {
for(my $i = 0; $i < @seqName; $i++) {
    if ( $seqName[$i] =~ /$pattern/ ) {
push @matchedNames, $seqName[$i];
    }
}
    }
    @matchedNames = ExtractUnique(@matchedNames);
    if (defined($opt_v)) {
my @complement = InANotInB(\@seqName, \@matchedNames);
@matchedNames = @complement;
    }
    foreach my $n (@matchedNames) {
# This has a problem if seqName contains reqular expression char
# my @indexArr = grep {$seqName[$_] =~ /^$n$/} 0..$#seqName;
my @indexArr = grep {$seqName[$_] eq $n} 0..$#seqName;
if (@indexArr != 1) {
    die "ERROR: ". scalar(@indexArr) . " sequences with name: $n\n";
}
push @result, $n . $sep . $seqDat[$indexArr[0]];
    }
    if (defined($opt_v)) {
print STDERR "INFO: ". scalar(@result) ." names didn't match the following patterns:\n";
    } else {
print STDERR "INFO: ". scalar(@result) ." names matched the following patterns:\n";
    }  
    foreach my $pattern (@$patternARef) {
print STDERR "  PATTERN: '$pattern'\n";
    }
    print STDERR join("\n", @matchedNames) . "\n\n";
    return @result;
}
sub PrintFASTA {
    my @seqName = GetSeqName(@_);
    my @seqDat = GetSeqDat(@_);
    for my $i (0..$#seqDat) {
print ">$seqName[$i]\n";
my $seq = $seqDat[$i];
for (my $pos=0 ; $pos < length ($seq) ;  $pos += $lineWidth) {
    print substr($seq, $pos, $lineWidth), "\n";
}
    }
}
sub InANotInB {
    my ($aRef, $bRef) =@_;
    my %seen = ();
    my @aonly =();
    foreach my $item (@$bRef) { $seen{$item} = 1};
    foreach my $item (@$aRef) {
push (@aonly, $item) unless $seen{$item};
    }
    return (@aonly);
}
sub ExtractUnique {
    my %seen=();
    my @unique = ();
    foreach my $item (@_) {
        push (@unique, $item) unless $seen{$item}++;
    }
    return @unique;
}
sub Max {
    my $max = shift;
    foreach $item (@_) {
        if ($item > $max) {
            $max = $item;
        }
    }
    return $max;
}
sub MemberQ {
    my ($x, $arrRef) = @_;
    foreach my $item (@$arrRef) {
        if ($x eq $item) {
            return 1;
        }
    }
    return 0;
}
Execution: perl  specific_seq_extraction.pl -m 'helicase' fasta_file > seq_with_header_helicase
################################################################################
#To find the length of a fasta sequence
my $seq = '';
while (<STDIN>)
{
    chomp;
    if (/^>/)
    {
        if ($seq ne '') { print length($seq), "\n"; $seq = ''; }
        next;
    }
    $seq .= $_;
}
if ($seq ne '') { print length($seq), "\n"; }
Execution: perl fasta_length.pl < fasta_file > file_seq_length
################################################################################
#To open a file. Not recommended for big file
$string_filename = 'file.txt';
open(FILE, $string_filename); @array = FILE;

#Manipulation of a FASTA file. Fragment length determination and codon findings

# Open the file. If could not open, echo that couldn't be read.
$string_filename = 'file.txt';
open(FILE, $string_filename) || die("Couldn't read file $string_filename\n");
#Read FASTA record. Assig the seq to variable named local.
local $/ = "\n>";
#Condition test by while loop.
while (my $seq = <>) {

#Use chomp to get rid of trailing spaces

chomp $seq;
    $seq =~ s/^>*.+\n//;  # remove FASTA header
    $seq =~ s/\n//g;  # remove endlines

    $R = length $seq;

    if ( $seq =~ /ggc/ ) {
        $M = $';
        $W = length $M;
        if ( $seq =~ /atg/ ) {
            $K = $`;
            $Z = length $K;
            $x = $W + $Z - $R;  
            print "\n\ the distance is the following: $x\n\n";
        } else {
            print "\n\ I couldn't find the start codon.\n\n";
        }
    } else {
        print "\n\ I couldn't find the binding site.\n\n"; }
    }

}  # end while


close FILE;

exit;

#Manipulation of a FASTA file. Fragment length determination and codon findings (with bioperl)

use Bio::SeqIO;
my $seqio = Bio::SeqIO->new(-file => "file.fa", '-format' => 'Fasta');
while(my $seq = $seqio->next_seq) {
  my $string = $seq->seq;
  # do stuff with $string
}
while(my $string =~/atg(.*)ggc/gi) {
  # do something with match
  # e.g. match start = $-[0]+1, match end = $+[0]
}
#Script to remove FASTA sequences shorter that your specified number . Here all seq les than 200 will be deleted
#Use: perl remove_small.pl 200 input.fasta > result.fasta
#!/usr/bin/perl
use strict;
use warnings;

my $minlen = shift or die "Error: `minlen` parameter not provided\n";
{
    local $/=">";
    while(<>) {
        chomp;
        next unless /\w/;
        s/>$//gs;
        my @chunk = split /\n/;
        my $header = shift @chunk;
        my $seqlen = length join "", @chunk;
        print ">$_" if($seqlen >= $minlen);
    }
    local $/="\n";
}
Uses
?
#To filter the sequence by its length
#Use: perl Seq_filter.pl -i INPUT.TXT -min 30 -max 60
#!/usr/bin/env perl
@input_files=();
$min_length=0;
$max_length=999999999999;
$treat_long=0;
$|=1;

###   CHECK COMMAND LINE ARGUMENTS   ###
if(@ARGV==0)
{
print"No arguments passed to the script!\nIf you entered arguments try the following command:\nperl length_cutoff.pl -argument1 -argument2 ...\n\n";
exit;
}

$argv="";
foreach(@ARGV)
{
$argv.=$_;
}
@arguments=split('-',$argv);

foreach(@arguments)
{
if($_=~/^ *i/)
{
$_=~s/^ *i//;
$_=~s/ //g;
push(@input_files,$_);
}
elsif($_=~/^ *I/)
{
$_=~s/^ *I//;
$_=~s/ //g;
open(FILES_IN,"$_");
while(<FILES_IN>)
{
unless($_=~/^\s*$/)
{
$_=~s/\s//sg;
push(@input_files,$_);
}
}
}
elsif($_=~/^ *min/)
{
$_=~s/^ *min//;
$_=~s/ //g;
$min_length=$_;
unless($min_length=~/^\d+$/)
{
print"Minimum sequence length has to be numerical!\n";
exit;
}
}
elsif($_=~/^ *max/)
{
$_=~s/^ *max//;
$_=~s/ //g;
$max_length=$_;
unless($max_length=~/^\d+$/)
{
print"Maximum sequence length has to be numerical!\n";
exit;
}
}
elsif($_=~/^ *[01]/)
{
$_=~s/ *//g;
$_=~s/ //g;
$treat_long=$_;
}
elsif($_!~/^\s*$/)
{
print"Don't know how to treat argument $_!\nIt will be ignored.\n\n";
}
}
unless($min_length<=$max_length)
{
print"Maximum sequence length must be >= minimum seuquence length!\n";
exit;
}
if(@input_files==0)
{
print"No input file specified!\n";
exit;
}

###   PRINT ARGUMENTS   ###
print"The following files will be processed:\n";
foreach(@input_files)
{
if(-e $_)
{
print"$_\n";
push(@input_files_ok,$_);
}
else
{
print"could not find file: $_. It will be ignored.\n";
}
}
print"Minimum seuquence length: $min_length\nMaximum seuquence length: $max_length\n";
if($treat_long==1)
{
print"Sequences exceeding maximum length will be clipped to proper size.\n\n";
}

###   START   ###
open(OUT_OK,">sequences_ok.fas");
unless($treat_long==1)
{
open(OUT_LONG,">sequences_too_long.fas");
}
open(OUT_SHORT,">sequences_too_short.fas");
$seqs_ok=0;
$seqs_long=0;
$seqs_short=0;
foreach$file(@input_files_ok)
{
print"processing $file";
open(IN,$file);
while(<IN>)
{
if($_=~/^>/)
{
$title=$_;
}
elsif($_!~/^\s*$/)
{
chomp$_;
if(length$_>=$min_length)
{
if(length$_<=$max_length)
{
$seqs_ok++;
print OUT_OK "$title$_\n";
}
elsif($treat_long==0)
{
$seqs_long++;
print OUT_LONG "$title$_\n";
}
elsif($treat_long==1)
{
$_=substr($_,0,$max_length);
$seqs_long++;
print OUT_OK "$title$_\n";
}
}
else
{
$seqs_short++;
print OUT_SHORT "$title$_\n";
}
}
}
close IN;
print" done.\n";
}
close OUT_OK;
close OUT_LONG;
close OUT_SHORT;
print"Sequences with proper size:\t$seqs_ok\nSequences too long:\t\t$seqs_long\nSequences too short:\t\t$seqs_short\n\n";
exit;

#To filter the FASTA file by sequence name.
#Use: perl Sort_fasta.pl [options] My_input
#!/usr/bin/perl -w

my $usage="\nUsage: $0 [-hrg] [fastaFileName1 ...]\n".
    "  -h: help\n".
    "  -r: reverse\n" .
    "  -g: remove gaps '-' from the sequence\n".
    "Sort FASTA sequences alphabetically by names.  If multiple files are \n".
    "given, sequences in all files are marged before sorting.  If no \n".
    "argument is given, it will take STDIN as the input.\n" .
    "Note that the entire sequence label including spaces is used as\n".
    "the name.\n";

our($opt_h, $opt_g, $opt_r);

use Bio::SeqIO;

use Getopt::Std;
getopts('hgr') || die "$usage\n";
die "$usage\n" if (defined($opt_h));

my $format = "fasta";
my @seqArr = ();

@ARGV = ('-') unless @ARGV;
while (my $file = shift) {
    my $seqio_obj = Bio::SeqIO->new(-file => $file, -format => $format);
    while (my $seq = $seqio_obj->next_seq()) {
       # need to deal with spaces
 $seq->desc( $seq->id . " ". $seq->desc);

 push(@seqArr, $seq);
    }
}

if (defined($opt_r)) {
    @seqArr = sort { - ($a->desc() cmp $b->desc()) } @seqArr;
} else {
    @seqArr = sort { $a->desc() cmp $b->desc() } @seqArr;
}

my $seqOut = Bio::SeqIO->new(-fs => \*STDOUT, -format => $format);
foreach my $s (@seqArr) {
    # prints "id desc", and desc was modified, returning it to original
    my $thisDesc = $s->desc;
    $thisDesc =~ s/^\S+ //; # remove the first word.
    $s->desc($thisDesc);

    if(defined($opt_g)) {
 my $tmp = $s->seq();
 $tmp =~ s/-//g;
 $s->seq($tmp);
    }
    $seqOut->write_seq($s);
}

exit;