Tuesday, December 1, 2015

Language: Perl (Operators, Variables, Arrays, Scalar, loops, conditions, code snippets)...

Practical Extraction and Reporting Language
http://www.tutorialspoint.com/ is a good link to learn perl
-------------------------------------------------------------------------------------------------------------
#! /usr/bin/perl 
use strict;
use warnings;
-------------------------------------------------------------------------------------------------------------
CPAN (Comprehensive Perl Archive Network)

Operators
use strict; use warnings;

Arithmetic
my $x = 5; my $y = 10;
print "x is $x and  y is $y\n";

my $add = $x + $y;
print "Sum will be: $x + $y = $add\n";

my $subtract = $x - $y;
print "Subtraction will be: $x - $y = $subtract\n";

my $multiplied_value = $x * $y;
print "Multiplication will be: $x * $y = $multiplied_value\n";

my $division = $x / $y;
print "Division will be: $x / $y = $division\n";

my $exponential_value = $x ** $y;
print "Exponential will be: $x ** $y = $exponential_value\n";

my $remainder = $x % $y;
print "Reminder will be: $x % $y = $remainder\n";
x is 5 and  y is 10                                                                                                                          
Sum will be: 5 + 10 = 15                                                                                                                     
Subtraction will be: 5 - 10 = -5                                                                                                             
Multiplication will be: 5 * 10 = 50                                                                                                          
Division will be: 5 / 10 = 0.5                                                                                                               
Exponential will be: 5 ** 10 = 9765625                                                                                                       
Reminder will be: 5 % 10 = 5 

my $current_value = $x;
print "\nCurrent Value of x is : $current_value\n";
print "Raised Value of x: $x + 1 \n";
print "Lowered Value of x: $x - 1\n";
Current Value of x is : 5                                                                                                                    
Raised Value of x: 5 + 1                                                                                                                     
Lowered Value of x: 5 - 1     

String
my $string_1 = "\nI like fruits.";my $string_2 = "Mango is my favorite.";
print "$string_1\n"; print "$string_2\n";
#Join two strings
my $joined_string = $string_1 . ' ' . $string_2;
print "Combined sentence is: $joined_string\n";
I like fruits.                                                                                                                               
Mango is my favorite.                                                                                                                        
Combined sentence is:                                                                                                                        
I like fruits. Mango is my favorite.  
#########################
Arrays
use strict; use warnings;
my @first_array = ('DNA', 'ATGGGC', 8, 3,'RNA', 'AUGC');
print $first_array[0], "\n"; print $first_array[1], "\n";
DNA                                                                                                                                                                             
ATGGGC 
my $size_array1 = scalar(@first_array);
my $size_array2 = @first_array;
print "Scalar of array: $size_array1\n";
print "Index size of array: $#first_array\n";
print "size_array2: $size_array2\n";
Scalar of array: 6                                                                                                                                                              
Index size of array: 5                                                                                                                                                          
size_array2: 6 
#########################

Variable denoted by $ sign.
Variable on the left-hand side and the expression to be evaluated on the right
e.g. a=5
----------------------------------
Conditional decision: if..else, elseif, switch
Statement: break, continue
----------------------------------
Loops: for, while, do....while, foreach
for (initialization; condition; increment)
{
   code to be executed;
}

while (condition)
{
   code to be executed;
}

do
{
   code to be executed;
}
while (condition);


foreach (array as value)
{
   code to be executed;
}
----------------------------------
For loop to iterate over the array to find element at each indices
for (my $i=0; $i<=$#first_array; $i++) {
    print "Array index: $i\n";
    print "$first_array[$i] \n";
}
Array index: 0                                                                                                                                                                  
DNA                                                                                                                                                                             
Array index: 1                                                                                                                                                                  
ATGGGC                                                                                                                                                                          
Array index: 2                                                                                                                                                                  
8                                                                                                                                                                               
Array index: 3                                                                                                                                                                  
3                                                                                                                                                                               
Array index: 4                                                                                                                                                                  
RNA                                                                                                                                                                             
Array index: 5                                                                                                                                                                  
AUGC                                                                                                                                                                            
#########################
Using foreach loop to find key and corresponding value of hash
my %sequence = ('DNA1' =>'ATCCTTGCT','DNA2' =>'ATCTCGGCTTGCT', 'RNA' =>'CUCUGCC', 'Number of seq'=> 2);
print $sequence{'DNA1'}, "\n";

foreach my $key (sort (keys %sequence)) {
    print "Key of hash is $key and Value of hash is $sequence{$key}\n";
}
ATCCTTGCT                                                                                                                                                                       
Key of hash is DNA1 and Value of hash is ATCCTTGCT                                                                                                                              
Key of hash is DNA2 and Value of hash is ATCTCGGCTTGCT                                                                                                                          
Key of hash is Number of seq and Value of hash is 2                                                                                                                             
Key of hash is RNA and Value of hash is CUCUGCC 
-------------------------------------------------
#Install Perl module called Term-Animation.
#Open a command-line terminal (select Applications > Accessories > Terminal),
#Install Term::Animation
sudo apt-get install libcurses-perl
cd /tmp
wget http://search.cpan.org/CPAN/authors/id/K/KB/KBAUCOM/Term-Animation-2.4.tar.gz
tar -zxvf Term-Animation-2.4.tar.gz
cd Term-Animation-2.4/
perl Makefile.PL && make && make test
sudo make install
-------------------------------------------------

###############################################
print "Hello\n";
Hello 

$a = 5;
print "Value of a = $a\n";
Value of a = 5
###############################################
Identifier example
$a = 5;
$var = <<"EOF";
I like autumn.
Trees turn golden.
a = $a
EOF
print "$var\n";
I like autumn.                                                                                     
Trees turn golden.                                                                                 
a = 5  
###############################################
###############################################
#To remove fasta headers of sequences in a file
#data file is named file
>E.coli
ATCGGTATGGA

>B. subtilis

ATTAACCGGTAT
#script file is script.pl
#Execute as perl script.pl file
------------------------------
use strict;
use warnings;

my $seq;
while (<>) {
    if (/^>/) {
        print "$seq\n" if $seq;
        undef $seq;
        next;
    }

    chomp;
    $seq .= $_;

    print "$seq\n" if $seq and eof ARGV;
}

ATCGGTATGGA                                                                                               
ATTAACCGGTAT
###############################################
#Transcription (DNA into RNA)
#For this code to wok use strict; and use warning; need to be suppressed.
#use strict;
#use warnings;

$DNA ='ATGGCACCTTAG';
print "DNA sequence is: \n";
print $DNA, "\n";
$RNA = $DNA;
$RNA =~ s/T/U/g;
print "RNA sequence is: \n";
print "$RNA\n";
exit;
DNA sequence is:                                                                                          
ATGGCACCTTAG                                                                                              
RNA sequence is:                                                                                          
AUGGCACCUUAG
------------------------------------
#Another script for transcription
# This script converts DNA seq into RNA seq in user input fashion. Asks for file name having DNA.
#Its good for 1 sequence
print "Name of the file conainng DNA sequence:= ";
$dnafilename = <STDIN>;
chomp $dnafilename;
unless ( open(DNAFILE, $dnafilename) ) {
    print "Cannot open file \"$dnafilename\"\n\n";
    goto h;
}
@DNA = <DNAFILE>;
close DNAFILE;
$DNA = join( '', @DNA);
print "The original DNA Sequence :=\n\n";
$DNA =~ s/\s//g;
print "$DNA\n\n";
$RNA = $DNA;
$RNA =~ s/T/U/g;
$RNA =~ s/t/u/g;
print "Transcribing DNA TO RNA :=\n\n";
print "$RNA\n";
<STDIN>;                                

################################################
#Translation (DNA to prtein conversion)
#User input-based. The file should have DNA seq without '>' symbol.
print "Enter the file name with DNA sequence= ";
$DNAfilename = <STDIN>;
chomp $DNAfilename;
unless ( open(DNAFILE, $DNAfilename) ) {
    print "Cannot open file \"$DNAfilename\"\n\n";
}
@DNA = <DNAFILE>;
close DNAFILE;
$DNA = join( '', @DNA);
print " \nThe original DNA file is:\n$DNA \n";
$DNA =~ s/\s//g;
my $protein='';
my $codon;
for(my $i=0;$i<(length($DNA)-2);$i+=3)
{
$codon=substr($DNA,$i,3);
$protein.=&codon2aa($codon);
}
print "The translated protein is :\n$protein\n";
<STDIN>;

sub codon2aa{
my($codon)=@_;
$codon=uc $codon;
my(%g)=('TCA'=>'S','TCC'=>'S','TCG'=>'S','TCT'=>'S','TTC'=>'F','TTT'=>'F','TTA'=>'L','TTG'=>'L','TAC'=>'Y','TAT'=>'Y','TAA'=>'_','TAG'=>'_','TGC'=>'C','TGT'=>'C','TGA'=>'_','TGG'=>'W','CTA'=>'L','CTC'=>'L','CTG'=>'L','CTT'=>'L','CCA'=>'P','CCC'=>'P','CCG'=>'P','CCT'=>'P','CAC'=>'H','CAT'=>'H','CAA'=>'Q','CAG'=>'Q','CGA'=>'R','CGC'=>'R','CGG'=>'R','CGT'=>'R','ATA'=>'I','ATC'=>'I','ATT'=>'I','ATG'=>'M','ACA'=>'T','ACC'=>'T','ACG'=>'T','ACT'=>'T','AAC'=>'N','AAT'=>'N','AAA'=>'K','AAG'=>'K','AGC'=>'S','AGT'=>'S','AGA'=>'R','AGG'=>'R','GTA'=>'V','GTC'=>'V','GTG'=>'V','GTT'=>'V','GCA'=>'A','GCC'=>'A','GCG'=>'A','GCT'=>'A','GAC'=>'D','GAT'=>'D','GAA'=>'E','GAG'=>'E','GGA'=>'G','GGC'=>'G','GGG'=>'G','GGT'=>'G');
if(exists $g{$codon})
{
return $g{$codon};
}
else
{
print STDERR "Bad codon \"$codon\"!!\n";
exit;
}
}
The original DNA file is:                                                                                                                                     
CAGAGAGGAACTTGCTCTCGAGCTGA                                                                                                                                    
The translated protein is :                                                                                                                                   
QRGTCSRA                                                                                                                                                      
  ################################################
#Read the protein from a file.
#The data file is named protein_name. It contains a seq.
#>ATCGHTDHHGTLYRMTWTDGEE
#use strict;
#use warnings;

$proteinfilename ='protein_name';
open (PROTEINFILE, $proteinfilename);
$protein = <PROTEINFILE>;
close PROTEINFILE;
print "The protein seq is: \n";
print "$protein\n";
exit;
The protein seq is:                                                                                       
>ATCGHTDHHGTLYRMTWTDGEE 
################################################
#Concatenation of sequences
$DNA1 ='ATTGAA';
$DNA2 ='GCGATT';
print $DNA1, "\n";
print $DNA2, "\n";
$DNA3 ="$DNA1$DNA2";
$DNA3 =$DNA1. $DNA2;
print "The concatenated DNA is: \n";
print "$DNA3\n\n";
exit;
ATTGAA                                                                                                    
GCGATT                                                                                                    
The concatenated DNA is:                                                                                  
ATTGAAGCGATT   
#################################################
#Reverse complementing of  sequence
$DNA ='CGCGTGGCA';
print "$DNA \n";
$revcom = reverse $DNA;
$revcom =~ s/A/T/g;
$revcom =~ s/T/A/g;
$revcom =~ s/C/G/g;
$revcom =~ s/G/C/g;
#The line below takes care of all 4 bases and their cases
$revcom =~ tr/ACGTacgt/TGCAtgca/;
print "Reverse complement of the given DNA seq is: \n";
print "$revcom\n";
exit;
CGCGTGGCA                                                                                                 
Reverse complement of the given DNA seq is:                                                               
TGGGTGGGG 
###################################################
Manipulation of array 
=for comment
@bases = ('A', 'C', 'G', 'T');
print "\n First base is: "; print $bases[0];
print "\n Second base is: "; print $bases[1];
print "\n Third base is: "; print $bases[2];
print "\n Fourth base is: "; print "$bases[3]\n";
=cut
First base: A                                                                                            
 Second base: C                                                                                           
 Third base: G                                                                                            
 Fourth base: T  
------------------------------------
=for comment
@bases = ('A', 'C', 'G', 'T');
$base1 =pop@bases;
unshift (@bases, $base1);
print "Seq after last item was moved to the front: ";
print "@bases\n";
=cut
Seq after last item was moved to the front: T A C G 
------------------------------------
=for comment
@bases = ('A', 'C', 'G', 'T');
$base2 =shift@bases;
push (@bases, $base2);
print "Seq after first item was moved to the end: ";
print "@bases\n";
=cut
Seq after first item was moved to the end: C G T A
------------------------------------
@bases = ('A', 'C', 'G', 'T');
#print "@bases\n";
$a = @bases;
print "Scalar context:  ";
print $a, "\n";
($a) = @bases;
print "List context:  ";
print $a, "\n";
exit;
Scalar context:  4                                                                                        
List context:  A   
####################################################
#Code to extract specific fasta sequences from a fasta file
#Execution: perl  specific_seq_extraction.pl -m 'helicase' fasta_file > seq_with_header_helicase
#my fasta_file looks like as given below, where I want the sequences with pattern 'helicase'
> |helicase|
ATCGGGGTTT

> |protease| 
AATTGGCGGG

> |helicase|

TTGGATTTCC
-----------------------------------------------------------
#! /usr/bin/perl
my $usage="\nSelect specified sequences from a fasta file and print them out\n" .
    "Usage: $0 [-hvp] -f seqNamesFile [fastaFile]\n" .
    "   or  $0 [-hv] -m 'pattern' [fastaFile]\n" .
    "   or  $0 [-hv] -n 'i,j,k,...' [fastaFile]\n" .
    "   or  $0 [-hv] -l 'i-j,...' [fastaFile]\n".
    " -f: specify the filename which contains a list of sequence names\n" .
    "     Each line contains a name. Comments can be added with \"#\".\n" .
    " -p: 'p'attern.  With -f, each line contains perl type regular expression.\n" .
    "     If -p, -v & -f are specified simultaneously, the output includes\n".
    "     sequences which does NOT much any of the regular expressions.\n".
    "     NOTE: '#' should not be a part of the regular expression.\n".
    " -m 'pattern': extract the sequences whose name contains the pattern. \n".
    "               You can use perl type regular expressions\n".
    " -n 'range': take list of integers.  e.g. -n '-3,5,7-9, 12-' will\n".
    "             select sequences 1,2,3,5,7,8,9,12,13,...\n".
    "             This option can be used to reorder or doubling up some seqs\n".
    "             e.g., -n '3,1,2,2' will bring the 3rd sequence to the front,\n".
    "             and the 2nd sequence get doubled\n".
    "  -l 'size-range': select the sequences based on sequence length.\n".
    "                   only ATGC and degenerate characters (including N) are\n".
    "                   counted, but '-', '?' etc are not counted.\n".
    "                   It takes a list of integers. e.g. -l '500-' will select\n".
    "                   sequences longer than 499bp. -l '100-200,300-400' will\n".
    "                   select sequences in the given ranges (ends are included).\n".
    " -v: Select the sequences which are NOT specified or NOT matching the " .
       "pattern\n".
    "\nIf name of input file (fastaFile) is not given, STDIN is used.\n" .
    "But -f, -m, or -n is mandatory.\n" .
    "Note -vs is not tested well.\n";


my $sep = "\t";
my $lineWidth = 70;   # used to break the long sequences into lines.

our ($opt_h, $opt_f, $opt_l, $opt_m, $opt_n, $opt_v, $opt_p);
use Getopt::Std;
getopts('hf:l:m:n:vp') || die "$usage\n";
die "$usage\n" if (defined($opt_h));

if (defined($opt_f) + defined($opt_m) + defined($opt_n) + defined($opt_l) != 1) {
    die "$usage\nERROR: Please specify ONLY 1 of -f, -m, -n, or -l options.\n\n";
}

if (defined ($opt_f)) {
    @selectedSeqs = ReadSeqNameFile($opt_f);
}

die "ERROR: -p can be used only with -f.\n$usage\n"
    if (defined($opt_p) && (defined($opt_m) || defined($opt_n)));

@ARGV = ('-') unless @ARGV; # take STDIN when no arg.
my $seqFile = shift @ARGV;
my @dat = ReadInFASTA($seqFile);

if (defined ($opt_f)) {
    if (defined($opt_p)) {
@dat = SelectSeqsByPattern(\@dat, \@selectedSeqs);
    } else { # exact match
if (defined ($opt_v)) {
    # this part isn't tested well, so check it.
    @dat = SelectSeqsNotSpecified(\@dat, \@selectedSeqs);
} else {
    @dat = SelectSeqs(\@dat, \@selectedSeqs);
}
    }
} elsif (defined ($opt_m)) {
    my @tmpPattern = ($opt_m);
    @dat = SelectSeqsByPattern(\@dat, \@tmpPattern);
} elsif (defined ($opt_n)) {
    @dat = SelectSeqsByNumber(\@dat, $opt_n);
} elsif (defined ($opt_l)) {
    @dat = SelectSeqsByLength(\@dat, $opt_l);
}

PrintFASTA(@dat);

exit(0);

# takes an arg; name of a file from which data are read Then read in
# the data and make an array.  Each element of this array corresponds
# to a sequence, name tab data.
sub ReadInFASTA {
    my $infile = shift;
    my @line;
    my $i = -1;
    my @result = ();
    my @seqName = ();
    my @seqDat = ();

    open (INFILE, "<$infile") || die "Can't open $infile\n";

    while (<INFILE>) {
        chomp;
        if (/^>/) {  # name line in fasta format
            $i++;
            s/^>\s*//; s/^\s+//; s/\s+$//;
            $seqName[$i] = $_;
            $seqDat[$i] = "";
        } else {
            s/^\s+//; s/\s+$//;
    s/\s+//g;                  # get rid of any spaces
            next if (/^$/);            # skip empty line
            s/[uU]/T/g;                  # change U to T
            $seqDat[$i] = $seqDat[$i] . uc($_);
        }
# checking no occurence of internal separator $sep.
die ("ERROR: \"$sep\" is an internal separator.  Line $. of " .
     "the input FASTA file contains this charcter. Make sure this " .
     "separator character is not used in your data file or modify " .
     "variable \$sep in this script to some other character.\n")
    if (/$sep/);
    }
    close(INFILE);

    foreach my $i (0..$seqName) {
$result[$i] = $seqName[$i] . $sep . $seqDat[$i];
    }
    return (@result);
}
sub GetSeqDat {
    my @data = @_;
    my @line;
    my @result = ();

    foreach my $i (@data) {
@line = split (/$sep/, $i);
push @result, $line[1];
    }

    return (@result)
}

sub GetSeqName {
    my @data = @_;
    my @line;
    my @result = ();

    foreach my $i (@data) {
@line = split (/$sep/, $i);
push @result, $line[0];
    }
    return (@result)
}

sub ReadSeqNameFile {
    my $file = shift;
    my @result = ();
    open(INFILE, "<$file") || die "Can't open $file\n";

    while (<INFILE>) {
        chomp;
        s/#.*$//;    # remove comments (#)
        s/^\s+//; s/\s+$//;
        next if (/^$/);
        push @result, $_;
    }
    close(INFILE);
    return @result;
}

sub SelectSeqsNotSpecified {
    my ($seqARef, $nameARef) = @_;
    my @seqName = GetSeqName(@$seqARef);
    my @seqDat = GetSeqDat(@$seqARef);

    my %nameH = ();
    foreach my $n (@$nameARef) {
$nameH{$n} = 1;
    }

    my @result = ();
    for(my $i = 0; $i < @seqName; $i++) {
next if (defined($nameH{$seqName[$i]}));
push @result, $seqName[$i] . $sep . $seqDat[$i];
    }
    return (@result);
}

sub SelectSeqs {
    my ($seqARef, $nameARef) = @_;
    my @seqName = GetSeqName(@$seqARef);
    my @seqDat = GetSeqDat(@$seqARef);

    # make a hash table
    my %seqHash = ();
    for my $i (0..$#seqName) {
if (exists($seqHash{$seqName[$i]})) {
    die "ERROR: In fasta file, there are more than 1 entry " .
"which has the name $seqName[$i]\n";
} else {
    $seqHash{$seqName[$i]} = $seqDat[$i];
}
    }

    # select the specified seqs
    foreach my $name (@$nameARef) {
if (exists($seqHash{$name})) {
    my $tmp = $name . $sep . $seqHash{$name};
    push @result, $tmp;
} else {
    warn "WARN: $name didn't occur in the input file\n";
}
    }
    return @result;
}

sub SelectSeqsByLength {
    my ($seqARef, $rangeTxt) = @_;

    my $numSeqs = scalar(@$seqARef);

    my @seqName = GetSeqName(@$seqARef);
    my @seqDat = GetSeqDat(@$seqARef);

    my @lenArr = ();
    foreach my $thisSeq (@seqDat) {
push @lenArr, SeqLenDegen($thisSeq);
    }

    my $maxLen = Max(@lenArr);

    my @allowedRange = MkSelIndex($maxLen, $rangeTxt); # this is 0-offset
    map {$_ ++} @allowedRange; # 1-offset now

    if (defined($opt_v)) {
my @allIndex = 1..$maxLen;
my @complement = InANotInB(\@allIndex, \@allowedRange);
@allowedRange = @complement;
    }

    @result = ();
    foreach my $i (0..$#seqDat) {
if (MemberQ($lenArr[$i], \@allowedRange)) {
    push @result, $seqName[$i] . $sep . $seqDat[$i];
    warn "INFO: $seqName[$i] = $lenArr[$i] bp. selected\n";
} else {
    #warn "INFO: $seqName[$i] = $lenArr[$i] bp. ignored\n";
}
    }

    return @result;
}

# IUPAC
sub SeqLenDegen {
    my $seq = shift;

    if ($aminoAcidSeq) {
$seq =~ s/[^ACDEFGHIKLMNPQRSTVWYZ]//gi;
    } else {
$seq =~ s/[^ACGTURYMKSWBDHVN]//gi;
    }

    return length($seq);
}

sub SelectSeqsByNumber {
    my ($seqARef, $rangeTxt) = @_;

    my $numSeqs = scalar(@$seqARef);

    my @index = MkSelIndex($numSeqs, $rangeTxt);

    if (defined($opt_v)) {
my @allIndex = 0..($numSeqs-1);
my @complementIndex = InANotInB(\@allIndex, \@index);
@index = @complementIndex;
    }

    my @result = ();

    foreach my $i (@index) {
push @result, $$seqARef[$i];
    }
    return @result;
}

# This is from selectSites.pl
# returns 0 offset index
sub MkSelIndex {
    my ($max, $siteList) = @_;
    $siteList =~ s/^\s+//;
    $siteList =~ s/\s+$//;

    my @sites = split(/\s*,\s*/, $siteList);

    my @result = ();
    foreach my $item (@sites) {
if ($item =~ /^(\d+)-(\d+)$/) {
    die "ERROR: 1st number is larger than 2nd in $item\n" if ($1 > $2);
    $beginPos = $1 - 1;
    $endPos = $2 - 1;
} elsif ($item =~ /^-(\d+)$/) {
    $beginPos = 0;
    $endPos = $1 - 1;
} elsif ($item =~ /^(\d+)-$/) {
    $beginPos = $1 - 1;
    $endPos = $max-1;
} elsif ($item =~ /^(\d+)$/) {
    $beginPos = $1 - 1;
    $endPos = $1 - 1;
} else {
    die "$siteList given as the list of sites.  " .
"Make sure it is comma delimitted, and each element is " .
    " one of the forms: 23-26, 29, -10, 40-\n";
}
push (@result, $beginPos..$endPos);
    }
    return(@result);
}

sub SelectSeqsByPattern {
    my ($seqARef, $patternARef) = @_;
    my @seqName = GetSeqName(@$seqARef);
    my @seqDat = GetSeqDat(@$seqARef);

    my @result = ();
    my @matchedNames = ();
    foreach my $pattern (@$patternARef) {
for(my $i = 0; $i < @seqName; $i++) {
    if ( $seqName[$i] =~ /$pattern/ ) {
push @matchedNames, $seqName[$i];
    }
}
    }

    @matchedNames = ExtractUnique(@matchedNames);

    if (defined($opt_v)) {
my @complement = InANotInB(\@seqName, \@matchedNames);
@matchedNames = @complement;
    }

    foreach my $n (@matchedNames) {
# This has a problem if seqName contains reqular expression char
# my @indexArr = grep {$seqName[$_] =~ /^$n$/} 0..$#seqName;
my @indexArr = grep {$seqName[$_] eq $n} 0..$#seqName;
if (@indexArr != 1) {
    die "ERROR: ". scalar(@indexArr) . " sequences with name: $n\n";
}
push @result, $n . $sep . $seqDat[$indexArr[0]];
    }

    if (defined($opt_v)) {
print STDERR "INFO: ". scalar(@result) ." names didn't match the following patterns:\n";
    } else {
print STDERR "INFO: ". scalar(@result) ." names matched the following patterns:\n";
    }
    foreach my $pattern (@$patternARef) {
print STDERR "  PATTERN: '$pattern'\n";
    }

    print STDERR join("\n", @matchedNames) . "\n\n";

    return @result;
}
sub PrintFASTA {
    my @seqName = GetSeqName(@_);
    my @seqDat = GetSeqDat(@_);
    for my $i (0..$#seqDat) {
print ">$seqName[$i]\n";
my $seq = $seqDat[$i];
for (my $pos=0 ; $pos < length ($seq) ;  $pos += $lineWidth) {
    print substr($seq, $pos, $lineWidth), "\n";
}

    }
}

sub InANotInB {
    my ($aRef, $bRef) =@_;
    my %seen = ();
    my @aonly =();

    foreach my $item (@$bRef) { $seen{$item} = 1};
    foreach my $item (@$aRef) {
push (@aonly, $item) unless $seen{$item};
    }
    return (@aonly);
}

sub ExtractUnique {
    my %seen=();
    my @unique = ();

    foreach my $item (@_) {
        push (@unique, $item) unless $seen{$item}++;
    }
    return @unique;
}

sub Max {
    my $max = shift;
    foreach $item (@_) {
        if ($item > $max) {
            $max = $item;
        }
    }
    return $max;
}

sub MemberQ {
    my ($x, $arrRef) = @_;
    foreach my $item (@$arrRef) {
        if ($x eq $item) {
            return 1;
        }
    }
    return 0;
}
------------------------------
>|helicase|                                                                                                 
ATCGGGGTTT                                                                                                  
>|helicase|                                                                                                 
ATCGGGGTTT 
########################################################
#To find the length of a fasta sequence
#Execution: perl fasta_length.pl < fasta_file > file_seq_length
my $seq = '';
while (<STDIN>)
{
    chomp;
    if (/^>/)
    {
        if ($seq ne '') { print length($seq), "\n"; $seq = ''; }
        next;
    }
    $seq .= $_;
}
if ($seq ne '') { print length($seq), "\n"; }
##########################################################

########################################################
#To find a particular motif in a fasta file
#Execution: perl motif_finder.pl < fasta_file > motif_seq
#! /usr/bin/perl
use strict;
use Bio::SeqIO;

my $usage = "perl motif_finder.pl <fasta file> <motif>";
my $fasta_filename = shift(@ARGV) or die("Usage: $usage $!");
my $motif = shift(@ARGV) or die("Usage: $usage $!");

my $fasta_parser = Bio::SeqIO->new(-file => $fasta_filename, -format => 'Fasta');
while(my $seq_obj = $fasta_parser->next_seq())
{
  printf("Searching sequence '%s'...", $seq_obj->id);
  if((my $pos = index($seq_obj->seq(), $motif)) != -1)
  {
    printf("motif found at position %d!\n", $pos + 1);
  }
  else
  {
    printf("motif not found.\n");
  }
}###################################################Most commonly-occurring k-mer search (input is a string and kmer value)
my $string="CGTGATGTG";
my $kmer=5; my %myHash;my $max=0;

for (my $aa=0; $aa<=(length($string)-5); $aa++) {
    my $myStr=substr  $string, $aa,$kmer;
    my $km=kmerMatch ($string, $myStr, $kmer);
    if ($km > $max) { $max = $km;}
    $myHash{$myStr}=$km;
}
foreach my $name (keys %myHash){
print "$name " if $myHash{$name} == $max;
}
sub kmerMatch {
my ($string, $myStr, $kmer)=@_;
my $count=0;
for (my $aa=0; $aa<=(length($string)-4); $aa++) {
    my $myWin=substr  $string, $aa,$kmer;
    if ($myWin eq $myStr) {
        $count++;
    }
}
return $count;
}
##########################################################
Identifier
EOF
'my' parameter is a way of defining variable
##########################################################
chomp: removes  newline character from the end of a string

No comments:

Post a Comment