Exploring the choppy water of coding: Python (3): Biopython.............

Examples from http://biopython.org/
ftp://ftp.expasy.org/databases (required dat files can be downloaded from here)
-------------------
Execute script as: python script.py
#To check Installation of Biopython
import Bio
# Frequently-used modules of python are Seq, pairwise2, AlignIO
-----------------------------------------------------------------------------------------------

#Import Seq module from Bio library of python
#! usr/bin/python
from Bio.Seq import Seq
#Create a sequence object
my_seq = Seq('ATGCGGATTGCAGGT')
#prints seq, reverse complement and protein
print 'DNA seq is: %s' % (my_seq)
print 'Seq %s is %i bases long' % (my_seq, len(my_seq))
print 'Reverse complement of DNA is: %s' % my_seq.reverse_complement()
print 'Transcribed mRNA is: %s' % my_seq.transcribe()
print 'Translated protein is: %s' % my_seq.translate()
-----------------------------------------------------------------------------------------------
#Import pairwise2 module from Bio library of python for sequence alignment
#! usr/bin/python
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
for i in pairwise2.align.globalxx("GGTCCTTAG", "TTTCGGAAG"):
print(format_alignment(*i))

from Bio.Align.Applications import ClustalwCommandline
help(ClustalwCommandline)
#clustalw
from Bio.Align.Applications import ClustalwCommandline
cline = ClustalwCommandline("clustalw2", infile="opuntia.fasta")
print(cline)
#Matrix-based protein sequence alignment
#! usr/bin/python
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
from Bio.SubsMat import MatrixInfo as matlist
matrix = matlist.blosum62
for i in pairwise2.align.globaldx("RTSMNRWT", "MPSTW", matrix):
print(format_alignment(*i))

#Multiple sequence alignment (MSA). MSA can be made by loading an alignment file (.aln) using AlignIO module

#! usr/bin/python
from Bio import AlignIO
align = AlignIO.read("opuntia.aln", "clustal")
print(align)
len(align)
for record in align:
    print("%s %i" % (record.id, len(record)))
#Extract first row
print(align[0].id)
#Extract last row
print(align[-1].id)
#Extract columns as strings row (here index 2, i.e. column3)
print(align[:, 2])
#Extract first five columns
print(align[:, :5])

#To find the count of A, T, C, G (their % too) in fasta sequence
#Use annotation file .ffn for it (It has individual gene ATCG sequence)
#Run as: python GC_content.py
#! usr/bin/python
from Bio import SeqIO

#Opened a FASTA file
input_file = open('L_11232015.ffn', 'r')
#Opened a output file
output_file = open('GC_content','w')
#Header line
#Header looks likie Gene   A   C   G   T   Length   CG%
output_file.write('Gene\tA\tC\tG\tT\tLength\tCG%\n')

for cur_record in SeqIO.parse(input_file, "fasta") :
#count nucleotides in this record...
   gene_name = cur_record.name
   A_count = cur_record.seq.count('A')
   C_count = cur_record.seq.count('C')
   G_count = cur_record.seq.count('G')
   T_count = cur_record.seq.count('T')
   length = len(cur_record.seq)
   cg_percentage = float(C_count + G_count) / length
   output_line = '%s\t%i\t%i\t%i\t%i\t%i\t%f\n' % \
   (gene_name, A_count, C_count, G_count, T_count, length, cg_percentage)
   output_file.write(output_line)

output_file.close()
input_file.close()

#QueryResult (compare query to the reference for hits)
from Bio import SearchIO
blast_qresult = SearchIO.read('my_blast.xml', 'blast-xml')
print(blast_qresult)

#Open a Swiss-Prot file
handle = open("myswissprotfile.dat")

import gzip
handle = gzip.open("myswissprotfile.dat.gz")

import urllib
handle = urllib.urlopen("http://www.somelocation.org/data/someswissprotfile.dat")
from Bio import ExPASy
handle = ExPASy.get_sprot_raw(myaccessionnumber)

#To read one Swiss-Prot record from the handle
from Bio import SwissProt
record = SwissProt.read(handle)
print(record.description)
print(record.organism_classification)

from Bio.ExPASy import Prosite
handle = open("prosite.dat")
records = Prosite.parse(handle)
record = next(records)
record.accession
record.name

from Bio.ExPASy import Enzyme
handle = open("enzyme.dat")
records = Enzyme.parse(handle)
ecnumbers = [record["ID"] for record in records]

Exploring the choppy water of coding

Monday, January 4, 2016

Python (3): Biopython.............

No comments:

Post a Comment