Exploring the choppy water of coding: Python (2): For bioinformatics (Fasta manipulation).........

These are the operations that invoke the module Biopython...
Tested by pycharm IDE
file 'data' looks like

>Ecoli_123
MSPWMKKVFLQCMPKLLMMRRTKY
>Pputida_167
IKKVLLIALTLSGAMGISREKRGL
>Saureus_436
MLSPSVAIKVQVLYIGKVRISQR
>Spyogenes_768
MFPGRTIGIMITASH

############################
#To extract a particular fasta sequence from a list of fasta sequence in a file (suppose its data.txt)

from Bio import SeqIO
input_file = r'data'record_id = 'Ecoli_123'
record_dict = SeqIO.to_dict(SeqIO.parse(input_file, 'fasta'))
record = record_dict[record_id]
sequence = str(record.seq)

print sequence

MSPWMKKVFLQCMPKLLMMRRTKY

############################
#To count the number of sequences in a fasta file

from Bio import SeqIO
filename = "data"count = 0for record in SeqIO.parse(filename, "fasta"):
    count = count + 1print("There are " + str(count) + " records in the file named" + filename)

There are 4 records in the fasta file data

############################

#To find the length of a single fasta record in the file
from Bio import SeqIO
fasta_record = SeqIO.read("data", "fasta")print(fasta_record.id + " length " + str(len(fasta_record)))
Ecoli_123 length 24
----------------
#To find the length of each fasta record in the file

from Bio import SeqIO

filename = "data"for record in SeqIO.parse(filename, "fasta"):
    print('Record ' + record.id + ', length ' + str(len(record.seq)))

Record Ecoli_123, length 24
Record Pputida_167, length 24
Record Saureus_436, length 23
Record Spyogenes_768, length 15
############################

#To find protein sequences that don't start with amino acid Methionine

from Bio import SeqIO
filename = "data"bad = 0for record in SeqIO.parse(filename, "fasta"):
    if not record.seq.startswith("M"):
        bad = bad + 1        print(record.id + " starts " + record.seq[0])
print("Found " + str(bad) + " records in " + filename + " which don't start with M")

gi|16127995|ref|NP_414543.1| starts T
gi|16127995|ref|NP_414545.1| starts S
Found 2 records in data which don't start with M

############################
#To find different motifs of A, T, C, G in the given fasta sequence
#! usr/bin/python
from itertools import groupby
fh = open('file.fasta')
groups = (x[1] for x in groupby(fh, lambda l: l[0] == ">"))
for g in groups:
    header = next(g).strip()
    seq = ''.join(s.strip() for s in next(groups))
    counts = {}
    for s in [''.join(g) for k, g in groupby(seq)]:
        if s not in counts: counts[s] = 0
        counts[s]+=1
    print header
    print counts

>gi|145413593|gb|AY858039.2| Dengue virus 3 strain den3_98, complete genome
{'AA': 491, 'AAA': 166, 'AAAA': 56, 'GGGG': 25, 'CCC': 76, 'TTTTT': 9, 'AAAAAAA': 1, 'TT': 302, 'TTT': 69, 'GGG': 113, 'CCCCCC': 2, 'CCCCC': 2, 'TTTTTT': 3, 'A': 1582, 'C': 1311, 'G': 1335, 'CC': 303, 'CCCC': 13, 'TTTT': 27, 'T': 1291, 'GG': 492, 'AAAAAA': 9, 'AAAAA': 18, 'GGGGG': 4}
[pseema@lab-0-8 virus_genomes]$

Exploring the choppy water of coding

Thursday, December 10, 2015

Python (2): For bioinformatics (Fasta manipulation).........

No comments:

Post a Comment