These are the operations that invoke the module Biopython...
Tested by pycharm IDE
file 'data' looks like
#To extract a particular fasta sequence from a list of fasta sequence in a file (suppose its data.txt)
#To count the number of sequences in a fasta file
#To find different motifs of A, T, C, G in the given fasta sequence
#! usr/bin/python
from itertools import groupby
fh = open('file.fasta')
groups = (x[1] for x in groupby(fh, lambda l: l[0] == ">"))
for g in groups:
header = next(g).strip()
seq = ''.join(s.strip() for s in next(groups))
counts = {}
for s in [''.join(g) for k, g in groupby(seq)]:
if s not in counts: counts[s] = 0
counts[s]+=1
print header
print counts
>gi|145413593|gb|AY858039.2| Dengue virus 3 strain den3_98, complete genome
{'AA': 491, 'AAA': 166, 'AAAA': 56, 'GGGG': 25, 'CCC': 76, 'TTTTT': 9, 'AAAAAAA': 1, 'TT': 302, 'TTT': 69, 'GGG': 113, 'CCCCCC': 2, 'CCCCC': 2, 'TTTTTT': 3, 'A': 1582, 'C': 1311, 'G': 1335, 'CC': 303, 'CCCC': 13, 'TTTT': 27, 'T': 1291, 'GG': 492, 'AAAAAA': 9, 'AAAAA': 18, 'GGGGG': 4}
[pseema@lab-0-8 virus_genomes]$
Tested by pycharm IDE
file 'data' looks like
>Ecoli_123
MSPWMKKVFLQCMPKLLMMRRTKY
>Pputida_167
IKKVLLIALTLSGAMGISREKRGL
>Saureus_436
MLSPSVAIKVQVLYIGKVRISQR
>Spyogenes_768
MFPGRTIGIMITASH
#############################To extract a particular fasta sequence from a list of fasta sequence in a file (suppose its data.txt)
############################from Bio import SeqIO input_file = r'data'record_id = 'Ecoli_123' record_dict = SeqIO.to_dict(SeqIO.parse(input_file, 'fasta')) record = record_dict[record_id] sequence = str(record.seq) print sequence MSPWMKKVFLQCMPKLLMMRRTKY
#To count the number of sequences in a fasta file
from Bio import SeqIO
filename = "data"count = 0for record in SeqIO.parse(filename, "fasta"):
count = count + 1print("There are " + str(count) + " records in the file named" + filename)
There are 4 records in the fasta file data
############################from Bio import SeqIO#To find the length of a single fasta record in the file
from Bio import SeqIO fasta_record = SeqIO.read("data", "fasta")print(fasta_record.id + " length " + str(len(fasta_record)))Ecoli_123 length 24
----------------
#To find the length of each fasta record in the file
filename = "data"for record in SeqIO.parse(filename, "fasta"): print('Record ' + record.id + ', length ' + str(len(record.seq)))
Record Ecoli_123, length 24 Record Pputida_167, length 24 Record Saureus_436, length 23 Record Spyogenes_768, length 15
############################
#To find protein sequences that don't start with amino acid Methionine
from Bio import SeqIO
filename = "data"bad = 0for record in SeqIO.parse(filename, "fasta"):
if not record.seq.startswith("M"):
bad = bad + 1 print(record.id + " starts " + record.seq[0])
print("Found " + str(bad) + " records in " + filename + " which don't start with M")
gi|16127995|ref|NP_414543.1| starts T
gi|16127995|ref|NP_414545.1| starts S
Found 2 records in data which don't start with M
#############################To find different motifs of A, T, C, G in the given fasta sequence
#! usr/bin/python
from itertools import groupby
fh = open('file.fasta')
groups = (x[1] for x in groupby(fh, lambda l: l[0] == ">"))
for g in groups:
header = next(g).strip()
seq = ''.join(s.strip() for s in next(groups))
counts = {}
for s in [''.join(g) for k, g in groupby(seq)]:
if s not in counts: counts[s] = 0
counts[s]+=1
print header
print counts
>gi|145413593|gb|AY858039.2| Dengue virus 3 strain den3_98, complete genome
{'AA': 491, 'AAA': 166, 'AAAA': 56, 'GGGG': 25, 'CCC': 76, 'TTTTT': 9, 'AAAAAAA': 1, 'TT': 302, 'TTT': 69, 'GGG': 113, 'CCCCCC': 2, 'CCCCC': 2, 'TTTTTT': 3, 'A': 1582, 'C': 1311, 'G': 1335, 'CC': 303, 'CCCC': 13, 'TTTT': 27, 'T': 1291, 'GG': 492, 'AAAAAA': 9, 'AAAAA': 18, 'GGGGG': 4}
[pseema@lab-0-8 virus_genomes]$
No comments:
Post a Comment