Friday, April 29, 2016

Python: Codes ........

Practical Computing for Biologists by Cliburn Chan
#To clone or copy a list in python
import copy
class Foo(object):
    """Minimal mutable wrapper around a single value.

    Its repr is just the wrapped value, so a list holding a Foo shows the
    current value of the instance it references.
    """

    def __init__(self, val):
        # Store the wrapped value as a plain, reassignable attribute.
        self.val = val

    def __repr__(self):
        # Render only the wrapped value (Foo(1) displays as 1).
        text = str(self.val)
        return text

# Build a list holding one shared Foo instance, then copy it four ways.
foo = Foo(1)
a = ['foo', foo]
b = a[:]              # copy via full slice
c = list(a)           # copy via the list() constructor
d = copy.copy(a)      # copy via copy.copy
e = copy.deepcopy(a)  # copy via copy.deepcopy

# mutate the shared instance (the lists themselves are not modified)
foo.val = 5

# Show all five lists after the mutation so the copies can be compared.
print('original: %r\n slice: %r\n list(): %r\n copy: %r\n deepcopy: %r'
      % (a, b, c, d, e))
#Creating a list of lists (sublists changed)
# The comprehension evaluates [[1] * 4] once per iteration, so each of the
# three outer elements is its own list object.
myList = [[[1] * 4] for n in range(3)]
#lst1 = [1]*4; lst = [lst1]*3
print myList
# Rebinds the single item held by the first outer element only.
myList[0][0] = 5
print myList
#check if a word is a palindrome
# Read a word and report whether it equals its own reverse.
word = raw_input("Enter a word: ")
if word == word[::-1]:
    print("%s is a palindrome!" % word)
else:
    # The else branch was missing, so both messages were emitted.
    print("%s is not palindrome" % word)
#count the number of vowels in a word
# Read a name and count how many vowels it contains.
name = raw_input("What's your name? ")
num_vowels = 0
# Count against the lower-cased name so that capital vowels are included.
lowered = name.lower()
for vowel in 'aeiou':
    num_vowels += lowered.count(vowel)
print("Hello %s, there are %d vowels in your name." % (name, num_vowels))

#Substitution of regex
import re
import sys

# Pattern capturing seven fields from an observation line; the replacement
# re-emits them tab-separated in the order 3, 2, 1, 4, 5, 6, 7.
find = r'(\d+)\s+(\w{3})[\w\,\.]*\s+(\d+)\sat\s(\d+):(\d+)\s+([-\d\.]+)\s+([-\d\.]+).*'
replace = r'\3\t\2.\t\1\t\4\t\5\t\6\t\7'
# The with-block closes the file even if a substitution raises.
with open('examples/Ch3observations.txt') as observations:
    for line in observations:
        newline = re.sub(find, replace, line)
        # Each line keeps its own trailing newline, so write it unchanged.
        sys.stdout.write(newline)
#Fitting the curve (Using 4 parameters, logistic equation). A plot will be generated
import numpy as np
import numpy.random as npr
import matplotlib.pyplot as plt
from scipy.optimize import leastsq
def logistic4(x, A, B, C, D):
    """4PL logistic equation.

    Evaluates ((A - D) / (1 + (x / C) ** B)) + D.

    x may be a scalar or a NumPy array; A, B, C and D are the four curve
    parameters. At x == 0 the result is A, and at x == C it is (A + D) / 2.
    """
    return ((A-D)/(1.0+((x/C)**B))) + D
def residuals(p, y, x):
    """Deviations of data from fitted 4PL curve.

    p is the parameter sequence (A, B, C, D), y the measured values and x the
    points at which they were measured. Returns y minus the curve value at x.
    """
    A, B, C, D = p
    return y - logistic4(x, A, B, C, D)
def peval(x, p):
    """Evaluated value at x with current parameters.

    p is the parameter sequence (A, B, C, D) passed on to logistic4.
    """
    A, B, C, D = p
    return logistic4(x, A, B, C, D)
# Make up some data for fitting and add noise
# In practice, y_meas would be read in from a file
x = np.linspace(0,20,20)
A,B,C,D = 0.5,2.5,8,7.3
y_true = logistic4(x, A, B, C, D)
y_meas = y_true + 0.2*npr.randn(len(x))
# Initial guess for parameters
p0 = [0, 1, 1, 1]
# Fit equation using least squares optimization
plsq = leastsq(residuals, p0, args=(y_meas, x))
# Plot results
# Draw the three series in the same order as the legend labels below; without
# this call the legend and annotations had nothing to describe.
plt.plot(x, peval(x, plsq[0]), x, y_meas, 'o', x, y_true)
plt.title('Least-squares 4PL fit to noisy data')
plt.legend(['Fit', 'Noisy', 'True'], loc='upper left')
# Annotate each true parameter next to its fitted estimate.
for i, (param, actual, est) in enumerate(zip('ABCD', [A,B,C,D], plsq[0])):
    plt.text(10, 3-i*0.5, '%s = %.2f, est(%s) = %.2f' % (param, actual, param, est))
# Display the figure so that a plot is actually generated.
plt.show()
#Simulation-based statistics (bootstrap and permuation resampling)
#Sampling without replacement
import numpy.random as npr
# The four calls below only demonstrate the generators; their results are
# not stored or printed.
# 4 draws from a normal distribution with mean 5 and standard deviation 1.
npr.normal(5, 1, 4)
# 10 integers drawn from the half-open range [1, 7).
npr.randint(1, 7, 10)
# 10 floats drawn uniformly from [1, 7).
npr.uniform(1, 7, 10)
# A 4x4 array of binomial counts (10 trials, success probability 0.2).
npr.binomial(n=10, p=0.2, size=(4,4))
# NOTE(review): x is assigned but no visible call samples from it; the
# without-replacement step announced in the heading appears to be missing.
x = [1,2,3,4,5,6]
#Sampling with replacement
import numpy as np
import numpy.random as npr
# Draw 4 resamples, each as long as data, by picking random indices; an
# index may repeat within a row, which is what makes this "with replacement".
data = np.array(['tom', 'jerry', 'mickey', 'minnie', 'pocahontas'])
idx = npr.randint(0, len(data), (4,len(data)))
samples_with_replacement = data[idx]
#Bootstrapping ( higher order function)
import numpy as np
import numpy.random as npr
import pylab
def bootstrap(data, num_samples, statistic, alpha):
    """Returns bootstrap estimate of 100.0*(1-alpha) CI for statistic.

    data: 1-D sequence or array of observations (must be non-empty).
    num_samples: number of bootstrap resamples to draw.
    statistic: callable invoked as statistic(samples, 1), i.e. it must reduce
        a 2-D array along axis 1 (np.mean, np.std, ...).
    alpha: significance level; 0.05 gives a 95% interval.

    Returns a (lower, upper) tuple taken from the sorted resampled statistics.
    """
    # Resample the argument itself; the previous body indexed a global named
    # x, which ignored whatever was passed in as data.
    data = np.asarray(data)
    n = len(data)
    idx = npr.randint(0, n, (num_samples, n))
    samples = data[idx]
    stat = np.sort(statistic(samples, 1))
    lower = int((alpha/2.0)*num_samples)
    # Clamp so that a very small alpha cannot index one past the end.
    upper = min(int((1-alpha/2.0)*num_samples), num_samples - 1)
    return (stat[lower], stat[upper])
if __name__ == '__main__':
    # data of interest is bimodal and obviously not normal
    x = np.concatenate([npr.normal(3, 1, 100), npr.normal(6, 2, 200)])
    # find mean 95% CI and 100,000 bootstrap samples
    low, high = bootstrap(x, 100000, np.mean, 0.05)
    # make plots
    pylab.hist(x, 50, histtype='step')
    pylab.title('Historgram of data')
    # Short horizontal bar at the sample mean, next to the jittered points.
    pylab.plot([-0.03,0.03], [np.mean(x), np.mean(x)], 'r', linewidth=2)
    pylab.scatter(0.1*(npr.random(len(x))-0.5), x)
    # Draw the confidence interval as an I-shaped marker at x = 0.2.
    pylab.plot([0.19,0.21], [low, low], 'r', linewidth=2)
    pylab.plot([0.19,0.21], [high, high], 'r', linewidth=2)
    pylab.plot([0.2,0.2], [low, high], 'r', linewidth=2)
    pylab.xlim([-0.2, 0.3])
    pylab.title('Bootstrap 95% CI for mean')

    # Same interval estimate for the standard deviation.
    low, high = bootstrap(x, 100000, np.std, 0.05)
#Permutation sampling (to find p-value)
import numpy as np
import numpy.random as npr
import pylab
def permutation_resampling(case, control, num_samples, statistic):
    """Returns p-value that statistic for case is different
    from statistic for control.

    case, control: 1-D sequences of observations.
    num_samples: number of random relabellings to draw (must be > 0).
    statistic: callable reducing a 1-D array to a number (e.g. np.mean).

    Returns (pval, observed_diff, diffs): the two-sided p-value, the absolute
    observed difference, and the list of differences from the permutations.
    """
    observed_diff = abs(statistic(case) - statistic(control))
    num_case = len(case)
    combined = np.concatenate([case, control])
    diffs = []
    for _ in range(num_samples):
        xs = npr.permutation(combined)
        # Use the caller's statistic (not a hard-coded mean) and record the
        # difference; previously it was computed but never stored.
        diffs.append(statistic(xs[:num_case]) - statistic(xs[num_case:]))
    # Compare element-wise on an array; diffs itself is returned as a list.
    null_diffs = np.asarray(diffs)
    pval = (np.sum(null_diffs > observed_diff) +
            np.sum(null_diffs < -observed_diff))/float(num_samples)
    return pval, observed_diff, diffs
if __name__ == '__main__':
    # make up some data
    case = [94, 38, 23, 197, 99, 16, 141]
    control = [52, 10, 40, 104, 51, 27, 146, 30, 46]
    # find p-value by permutation resampling
    pval, observed_diff, diffs = \
        permutation_resampling(case, control, 10000, np.mean)
    # make plots
    pylab.title('Empirical null distribution for differences in mean')
    # NOTE(review): newer matplotlib releases replace normed= with density=;
    # confirm against the installed version before changing it.
    pylab.hist(diffs, bins=100, histtype='step', normed=True)
    pylab.axvline(observed_diff, c='red', label='diff')
    pylab.axvline(-observed_diff, c='green', label='-diff')
    pylab.text(60, 0.01, 'p = %.3f' % pval, fontsize=16)
#Data visualization (for  exploratory data analysis)
import numpy as np
import pylab
# Columns are read as four (x, y) pairs: 0-1, 2-3, 4-5 and 6-7.
xs = np.loadtxt('anscombe.txt')
for i in range(4):
    x = xs[:,i*2]
    y = xs[:,i*2+1]
    # Design matrix [x, 1] so that lstsq returns slope m and intercept c.
    A = np.vstack([x, np.ones(len(x))]).T
    m, c = np.linalg.lstsq(A, y)[0]
    pylab.scatter(x, y)
    pylab.plot(x, m*x+c, 'r')

#Working with relational databases (connect, execute, iterate)
import sqlite3
# Connect to the database file and print every row of the people table.
con = sqlite3.connect('pcfb.sqlite')
r = con.execute('select * from people')
for i in r:
    print(i)
r = con.execute(’select, from people as p join experiment as e where e.researcher ==
for i in r:
print ’Name: %s\n\tExperiment: %s’ % (i[0],i[1])

 #source code available from git repository. Need to self-compile
 #easy_install/pip installation 

Sunday, April 17, 2016

Allergy, inflammation, pesticides, pathogens, diseases, domains: Insights and my hypotheses........

Science is undergoing overhaul for good. Pre-established facts are being overthrown. 
 In nature, everything is related. This aspect ought to be focussed while hypothesizing.
#Allergy/inflammation/hormone insights
When similarity is high proteins have isozyme, isoforms. When less, they are different but homology remains.
Check homology between chitin (NAGA-NAGA), peptidoglycan (NAGA-NAMA), cuticle.
Pollen , virus domains and bacterial surface proteins are same.
Virus and cockroach allergen have same domain (kelch, jacalin)
Check similarity between gibberellin, auxin, insect pheromone, animal hormone,
Pathogenesis of pathogens, allergens and venoms are same.........just the latter two can't replicate, so victim might survive. Venom is high dose, so it can coagulate blood and kill.
All irritants provoke immune system and cause neural inflammation, leading to different diseases. Brain malaria, brain dengue, brain fever are nothing but body's immune system trying too hard to get rid of the pathogens and in turn causing harm to the host body itself.
Stress causes strange surface protein formation in bacteria which human antibody can't trap.

Immunodeficiency and autoimmunity is a wrong classification. It's better to explain them as immune activation-led inflammation.
Pasteurella sp. caused the death of 50K antelopes; could it have been through consumption of bad water?
Chymotrypsin can degrade galacturonase
Trypsin can cleave leather, collagen, keratin and can sequester co-factor metals like Ni, Cd.
Lathyrus plant causes neural paralysis by manipulating host protein via its lectins. Any pulse consumed too often will cause the same.
Any form of irritant disturbs host proteases. 
Most critical: Serine-, cysteine-, and metalloproteases (cause charge relay system)
So, lead to allergy, infectious disease, cancer, autoimmune disease

Bacterial ubiquitous protease: AAA proteases, degP (acts as chaperone as well as peptidase) or sortase (cys) ( substrates function as adhesins, internalins, blood clotting and immune evasion factors, and transporters for nutrients )
Bacterial pathogenic protease: clostripain (cys), collagenase (cause gas gangrene), botulinum neurotoxin, tetanus neurotoxin

Serine/threonine-protein kinase (pknA-B)
Ser/Thr phosphatase

Serine protease: chymotrypsin (S1), subtilisin (S8) 

To counter it protease inhibitors are given in the form of drugs or antibiotics. Chemotherapy is nothing but protease inhibitors and DNA gyrase inhibitors (to prevent DNA replication). Venoms, sea cucumber, ginseng etc. do the same. Peptide can be protease inhibitor. Myoactive neuropeptide NGIWY amide was  isolated from the holothurian. Sea urchin Strongylocentrotus purpuratus.
Protease inhibitors: glycoproteins or sulfated polysaccharides
These are present from virus to human. These inhibitors-coding  gene family have undergone duplication in higher animals. 
Serpin superfamily:
all-beta, immunoglobulin (Ig) fold:  (bacteria), chagasin and amoebiasin
Glycoproteins:  Kazal type (viral, fungal , termite, jelly fish, sea cucumber, human,  (SPINK)) or kunitz type (mostly in venoms e.g. spider, tick,  snake). Both types have been found in helminthes and bacteria

Protease inhibition by: lock-and-key type,  conformational change and consequent kinetic trapping of an enzyme intermediate 

*Mice homozygous for SPINK gene results in postnatal lethality, growth retardation, dehydration, autophagic degeneration of acinar cells resulting in pancreas trophy, small intestine degeneration, and a small spleen.

Glycosylation of proteins occur in diabetes. It inactivates proteins that need to be activated. Protease can help in this scenario. Herbal drug cause glycosylation i.e is make activated protease inactive again. (most proteins are glycoproteins. Loss of glyco part might activate or inactivate the protein). Some hormones are glycoproteins. The allergen is cutting glyco moiety, so hormone can't act. Carbohydrate-degrading domains are known to play a role in fungal pathogenicity.
Non-pathogenicity is only a matter of time.
Person is not exposed to the allergen anymore, but the antibody, already formed is preventing the glycoproteins. Female menstrual cycle disturbance is an example of it. The hormone required for it are being destroyed by the antibody. That's what they call autoimmunity.
Breast infection...a type of its adipose tissue, pollutants are attracted here..
No one can underestimate the power of causes bloom in bare plant, it brings puberty in an innocent child...
Why, women face most allergies in late 20's and early 30's? May be because estrogen level is high at this phase. (LTP (tomato, pulses, fruits), insect, allergy).

Is wisdom teeth eruption related to nerve growth factor?
Inflammation is the cause of all diseases...personalized diet is needed for healthy living.
Catalytic site of protease contain or leach Ni, Mo, Cd or  Zn atoms. So, they become active and unleash immune response.
Due to disulfide arrangement, sulphur-rich proteins (cysteine protease as well) are stable against thermal and enzymatic degradation.
Sulfites in food can cause bronchial constriction, which can cause asthma (e.g preserved fries, namkeen, pizza)
Common acids in fruits: malic acid, tartaric acid, oxalic acid
Para-phenylenediamine (PPD) in ‘black henna’, hair dye, black rubber cause allergy
Polysaccharide (ligand)-----Dectin-1 (receptor) on APC (macrophage, dendritic cell)--- (Syk, NF-κB signaling, and cytokine release) --->TNF-α and IL-6 secretion
Dectin-1 blocking reagent: Laminarin 
Did pesticides or hormones cause protein misfolding (prions) and mad cow disease in cattle?
In the 80's and 90's, excessive use of pesticides in animal husbandry led to neural damage in the animals; when their corpses were devoured by scavengers (vultures), the latter succumbed. Pesticides caused inflammation in cattle, which caused mad cow disease, the causative agent of which was described as prions.....the misfolded proteins.....By devouring the carcasses, the vulture population almost vanished.
It is possible that the pesticides in food disrupted female reproductive system and caused birth defects, including autism in children.
Are food additives and other food chemicals mimicking estrogen and causing early puberty in girls?
Dairy farm animals are advertised as free-range grazing cattle...but then mix preservatives in milk..
Pesticides manipulate our serine proteases.  Formaldehyde killed a young instantly.
Facts are emerging that rampant use of pesticides is affecting nervous system of farmers, leading to depression and suicidal tendencies.

Drug resistance in pathogens and cancer in humans are outcomes of one stress.......drug abuse, pollution
deodorant use may be linked to diseases like cancer and Alzheimer's.
Surface antigens: repeat proteins of gram-positive cocci
Internalin: a repeat protein in Listeria
Transcriptional regulation (histidine kinases)
Chemotaxis (methyl-accepting proteins)
Catabolite repression (adenylate cyclases)
Modulation of enzyme activity (diguanylate cyclases and phosphodiesterases)PhoQ histidine kinase, essential for resistance to antimicrobial peptides is present in a variety of enteric pathogens. IS cause over-expression in some Mtb PhoQ.
Hyperthermophilic bacteria (i.e. Aquifex and Thermotoga) and archaea (e.g. Pyrococcus, Thermococcus, Methanothermus and Sulfolobus). Despite the small set of studied systems, it is clear that super-slow protein unfolding is a dominant strategy to allow these proteins to function at extreme temperatures. 
Repeat sequences are supposed to play a role in protein–protein interactions
Chloramphenicol acetyltransferase attaches an acetyl group to chloramphenicol, which prevents the drug from binding to ribosomes. It leads to drug resistance. In vitro culture-driven picture is a messed up picture.
Because of its excellent blood-brain barrier penetration (superior to any of the cephalosporins), chloramphenicol remains the first-choice treatment for staphylococcal brain abscesses.
Clostridium perfringens enterotoxin gene is on a transposable element
Iron-regulated heparin-binding hemagglutinin capacity of M. tuberculosis

Helicobacter pylori: uses CagA to perturb a host cell signaling pathway, which leads to development of peptic ulcer 
Plasmodium falciparum: histidine-rich proteins that facilitate its survival inside red blood cells 
Plant-pathogenic oomycetes: the multifunctional elicitin molecules facilitate infection by triggering host tissue necrosis; serve as a sterol-carrying protein
The elicitin-encoding gene is highly expressed at body temperature than room temperature. Something like this must be happening in Mtb replication. Stimulation of clathrin-mediated endocytosis by the elicitin. 
##Intracellular pathogens
Mycobacterium tuberculosis
Coxiella burnetii
Legionella pneumophila
Brucella abortus

Rickettsia conorii
Diseases are geographical, largely based on diet and lifestyle . 
Asians have vitiligo and diabetes, but no peanut allergy or Alzheimer's disease. Also, cancer death was less, though pollution is causing cancer even in villages. Peanut is native to Indian subcontinent, so people there have evolved to metabolize it, so there is no issue of allergy or anaphylaxis. 
The high incidence of diabetes might be due to a cereal-based diet. Eating too much sugar will activate carbohydrate-cleaving enzymes. These enzymes will disturb glycoproteins. The signalling system goes awry. Moderation is the answer. Diabetes and hormonal disturbance are tied together. That's why diabetic women can't conceive. Diabetic people have low sexual drive, as the hormones for the stimulation are getting destroyed.
In Western  countries, instances of Alzheimer, Parkinson, multiple sclerosis, autism, cancer is high. Cheese and  alcohol seem to be the culprit. Cheese is serine and tyrosine rich, so might be manipulating serine, tyrosine protease and serine/tyrosine receptors in brain. Alcohol causes liver cirrhosis, so enzymes can't be formed.

Diseases: Heart: Acute myocardial infarction
Lungs: Chronic obstructive pulmonary disease (COPD)
Neural: Autism
Fungal: Aspergillosis
Bacteria: Tuberculosis
Cystic fibrosis: A disease of cells producing mucus, sweat and digestive juice (lungs, liver, pancreas)

Alzheimer's: Processed foods produce toxins, they cause inflammation,  build-up of plaques, impaired cognitive function. Processed foods such as white breads, pasta, processed meats and cheeses cause inflammation.

Autism: Causes of autism might be:Pesticides (other endocrine disruptors), Alcohol, Drugs (anti-depressants). Study approaches: Microbiome of faeces, Brain imaging, Behavioral study
Vitiligo: Caucasian families with co-segregation of vitiligo and Hashimoto thyroiditis
Pfeiffer syndrome is strongly associated with mutations of Fibroblast growth factor receptor 1 and 2. A hypnic jerk is an involuntary twitch which occurs just as a person is beginning to fall asleep.
Disease-causing mutations: His to alanine; Phe to glycine
#mutagenesis results,  pharmacophore data
Increased circulating galaninL levels in serum contribute to the development of metabolic syndrome
Matrix molecules, i.e. collagens and proteoglycans. Defective hydroxylation of collagen cause scurvy.
Intermediate filaments (IF) 
type I: acidic cytokeratins
type II: basic cytokeratins
type III: vimentin, desmin, glial fibrillary acidic protein (GFAP), peripherin, and plasticin
type IV: neurofilaments L, H and M, alpha-internexin and nestin
type V: nuclear lamins A, B1, B2 and C
Mutations in long coiled-coil proteins causes diseases

Mutations in BRCA1 result in truncated proteins

95% of the cases of chronic myelogenous leukemia contain the Philadelphia chromosome, which is a translocation of part of chromosome 22 to chromosome 9. 
BRCA1: Hereditary breast cancer, hereditary ovarian cancer

 kinase activators, including epidermal growth factor (EGF) and the tumor promoting phorbol ester 12-O-tetradecanylphorbol-13-acetate (TPA)
Antibiotics affect host: G protein-coupled receptors,  intracellular calcium signals,  membrane cholesterol distribution.
Proton pump (H2) inhibitors can prevent acid production
Hydrocortisone prevents itching, eczema, psoriasis
Glucosamine and chondroitin sulfate is needed for joint health

Chromothripsis: Chromosomal rearrangement due to DNA damage in micronuclei
Most health issues start a vicious circle in us.
Pyrimidines: cytosine, thymine, and uracil
Purines: adenine, guanine
CG bond
AT bond, AU bond
Y: Any pyrimidine (C, T)
Out of 4 bases, A and C are methylated
Differential networks have recently been introduced as a powerful way to study the dynamic rewiring capabilities of an interactome in response to changing environmental conditions or stimuli. 
Paralogous genes are homologous genes that occur within one species and have diverged after a duplication event. 

Orthologous genes are homologous genes that diverged after a speciation event.
Protein expansion is primarily due to indels in intrinsically disordered regions
PCR amplification is one of the major sources of duplicates, which are usually introduced during sequencing library amplification.
Chromatin immunoprecipitation (ChIP) methodology: examine gene regulation in living cells
Plasmid isolation: CsCl/ethidium bromide gradient ultracentrifugation
Protein sizing: Western blot analysis
Localisation: immuno-histochemical staining
Aberrant or additional reactive bands: Southern blotting

Chimeric transcript detection: by RT-PCR
At high GC, coverage drops
Indel events most frequently occur in surface-exposed loops.
Human body contains about 10^13 human cells and  about 10^14 bacterial, fungal, and protozoan cells.
About 45% of human genome is transposable element.
Human genome size is 3,234.83Mb (almost 3 billion bases)
1.5% of the genome is CDS (20k-25K)
Each of the 23 chromosome has pseudogenes:  59 (chr 18) -1,130 (chr 1)
Confirmed protein range: 2,012 (in chromosome 1)- 45 (in   Y chromosome)
X chromosome codes for 815 genes
mtDNA has genes for only 13 proteins
Most variation is in chr 2 and 1
Longest chr: 1, 2
Shortest chr: mtDNA, 21, 22, 19, Y, 20
Large-scale sequencing efforts: 1000 Genomes, ExAC (Exome Aggregation Consortium), Scripps Wellderly, UK10K
Reference Variant Store (RVS) stores 400 million distinct variants observed in more than 80,000 human samples. (https://rvs.u.hpc.mssm.edu/)
Exhaustive annotation using tools such as snpEff, ANNOVAR, or VEP
Predictions of deleteriousness by SIFT, PolyPhen2, PROVEAN
Curated variant databases such as dbSNP, ClinVar, HGMD, OMIM, COSMIC
GEMINI: A software package  for exploring variation in personal genomes and family based genetic studies .
Well-studied disease genes and mutations
Breast cancer: BRCA2 (chr 13)
Cystic fibrosis: CFTR (chr 7)
Cytochrome b: MTCYB (mtDNA)
Hemoglobin: HBB (chr 11)

miRNA: Regulator of gene expression

snRNA: Small nuclear RNA (processes pre-mRNA and regulates transcription factors)
snoRNA: Small nucleolar RNA

After myosin and actin, titin is the third most abundant protein in human muscle

#Proteins/ Domains
Some proteins are fast-evolving 
Domains are common currencies of protein function that nature rearranges to create novel activities (function and evolutionary aspects can be learnt from them). Domains do not generally appear de novo but by shuffling and rearrangement of existing domains. Cache_2 is predicted to originate from GAF-PAS fold. Domain-swap analysis revealed that the COOH-terminal leucine-rich repeat.
Flo11 flocculin belongs to a family of proteins involved in invasive growth, cell-cell adhesion, and mating, many of which can substitute for each other under abnormal conditions. Flo11 flocculin in yeast gives the cell a wide range of phenotypes (multicellular structures such as biofilms, flors, or filaments), depending on the strain and the environmental conditions. Does it happen in Mtb too? Sure. If its present in virus and cockroach it must be in bacteria too.
All recognition-related proteins are glycosylated (they bind to mannose or other carbohydrates). explains the cell-cell interaction capacity of FLO11-expressing cells.
Insoluble and inactive proteins are co-produced due to codon bias, protein folding, phosphorylation, glycosylation, mRNA stability and promoter strength.
Not only enzymes, even adhesins are pH dependent.
Also, the enzymes are hypothetical, as need cofactors to be active. Culture medium might be lacking in them.
Heparinase need substrate to be activated. (may not activate in vitro)
These are called transposases in the case of DNA elements and integrases in the case of the best-characterized RNA elements, the retroviruses and retrotransposons.
ATPase are cell-surface, membrane traversing proteins

coiled coil proteins: c-Fos,c-jun,tropomyosin

N-terminal amino acid of a protein is an important determinant of its half-life
CpG motifs are considered pathogen-associated molecular patterns (PAMPs). CpG PAMP is recognized by the pattern recognition receptor (PRR) Toll-Like Receptor 9 (TLR9), which is constitutively expressed only in B cells and plasmacytoid dendritic cells (pDCs) 

Protein families have arisen during evolution by gene duplication and divergence

Friday, April 15, 2016

Language: Perl rules.....

Turn on warnings, using use strict; use warnings;
Declare variables with 'my', to avoid collision.
3 datatypes: $ for a scalar,  @  for  an  array, %  for  a  hash
 arrays use square brackets [ ]; hashes use curly braces { }
 array  variable  @ARGV  contains   arguments
special variable $0 (a zero) has the name of the program
If no arguments are given, print a USAGE statement and exit.
passing arguments into subroutines is called 'pass  by  reference '
The push adds an element to the end of the @i array
The shift removes the first element from the  @j  array
Using  randomization,  it's  possible  to  simulate  and  investigate  the  mechanisms  of
mutations in DNA
Call srand to seed the random number generator before you call this function.
Nested braces need to be read and evaluated from the inside out.
Perl modules/libraries are used to load in subroutines (module has .pm extension)
use BeginPerlBioinfo;
$&  is  a  special variable that's set after a successful regular-expression match

Language: Perl (from James Tisdall book).....(2)...

1# Read a DNA FASTA file, translate to protein, and format output (here make 25-character-long lines) 

use strict;
use warnings;
use BeginPerlBioinfo;

# Read the FASTA file, extract its sequence, translate it to protein and
# print the result in lines of 25 characters.
my @file_data = get_file_data("sample.dna");
my $dna       = extract_sequence_from_fasta_data(@file_data);
my $protein   = dna2peptide($dna);

print_sequence($protein, 25);


 2# Translate a DNA sequence in all three reading frames  (2 are in plus, 1 in minus strand)
use strict;
use warnings;
use BeginPerlBioinfo;

# Load the sequence once; each section below prints a banner, translates one
# frame and prints it in lines of 50 characters.
my @file_data = get_file_data("sample.dna");
my $dna       = extract_sequence_from_fasta_data(@file_data);
my $revcom    = '';
my $protein   = '';

# Frames 1 and 2 are read from the sequence as given.
print "\n -------Reading Frame 1--------\n\n";
$protein = translate_frame($dna, 1);
print_sequence($protein, 50);

print "\n -------Reading Frame 2--------\n\n";
$protein = translate_frame($dna, 2);
print_sequence($protein, 50);

# Frame 3 is read from the reverse complement.
$revcom = revcom($dna);
print "\n -------Reading Frame 3--------\n\n";
$protein = translate_frame($revcom, 3);
print_sequence($protein, 50);

#subroutine (2 of them i.e revcom, translate_frame)
# revcom: return the reverse complement of a DNA string.
# Takes one sequence string; upper and lower case A, C, G, T are
# complemented, and any other character is kept as is.
sub revcom {
    my($dna) = @_;
    # Assign to a plain scalar so reverse() runs in scalar context and
    # reverses the characters; "my($revcom) = reverse($dna)" is a list
    # assignment, which reverses a one-element list and changes nothing.
    my $revcom = reverse $dna;
    $revcom =~ tr/ACGTacgt/TGCAtgca/;
    return $revcom;
}


# translate_frame: translate part of a DNA sequence into protein.
# Arguments: the sequence, a 1-based start position, and an optional end
# position; when no end is given the sequence is translated to its end.
sub translate_frame {
    my($seq, $start, $end) = @_;
    unless($end) {
        $end = length($seq);
    }
    # substr offsets are 0-based, hence $start - 1.
    return dna2peptide ( substr ( $seq, $start - 1, $end - $start + 1) );
}
3#Find motifs/regex
# Report when $dna contains CT, then one of C/G/T, then ACG.
if( $dna =~ /CT[CGT]ACG/ ) {
    print "I found the motif!!\n";
}

# Make restriction map from user queries 
use strict;
use warnings;
use BeginPerlBioinfo;

my %rebase_hash = (  );
my @file_data = (  );
my $query = '';
my $dna = '';
my $recognition_site = '';
my $regexp = '';
my @locations = (  );

@file_data = get_file_data("sample.dna");
$dna = extract_sequence_from_fasta_data(@file_data);
%rebase_hash = parseREBASE('bionet');

# Prompt repeatedly for an enzyme name and list where its site occurs.
do {
    print "Search for what restriction site for (or quit)?: ";

    $query = <STDIN>;
    chomp $query;
    # Exit if empty query
    if ($query =~ /^\s*$/ ) {
        exit;
    }

    # Only names present in the REBASE hash are searched for.
    if ( exists $rebase_hash{$query} ) {
        ($recognition_site, $regexp) = split ( " ", $rebase_hash{$query});
        @locations = match_positions($regexp, $dna);
        if (@locations) {
            print "Searching for $query $recognition_site $regexp\n";
            print "A restriction site for $query at locations:\n";
            print join(" ", @locations), "\n";
        } else {
            print "A restriction site for $query is not in the DNA:\n";
        }
    }
    print "\n";
} until ( $query =~ /quit/ );

exit;


# Subroutine  
# match_positions: return the 1-based start position of every match of
# $regexp in $sequence (case-insensitive). Returns an empty list when
# nothing matches.
sub match_positions {
    my($regexp, $sequence) = @_;

    my @positions = (  );

    while ( $sequence =~ /$regexp/ig ) {
        # pos() is the offset just past the match; step back by the match
        # length and add 1 to get a 1-based start position.
        push ( @positions, pos($sequence) - length($&) + 1);
    }
    # Return only after the loop so that all matches are collected.
    return @positions;
}


4#Extract annotation and sequence from GenBank file  
use strict;
use warnings;
use BeginPerlBioinfo;

# parse1 fills @annotation and $sequence through the references passed in;
# @annotation has to be declared for the script to compile under strict.
my @annotation = (  );
my $sequence = '';
# NOTE(review): the file name is empty here; set it to the GenBank record to parse.
my $filename = '';

parse1(\@annotation, \$sequence, $filename);
print @annotation;
print_sequence($sequence, 50);

# Subroutine 
# parse1: split a GenBank file into annotation lines and sequence.
# Arguments: reference to an array that receives the annotation lines,
# reference to a scalar that receives the sequence, and the file name.
sub parse1 {
    my($annotation, $dna, $filename) = @_;
    my $in_sequence = 0;
    my @GenBankFile = (  );

    @GenBankFile = get_file_data($filename);
    foreach my $line (@GenBankFile) {
        if( $line =~ /^\/\/\n/ ) {
            # A line holding only // ends the record; stop reading.
            last;
        } elsif( $in_sequence) {
            # Everything after ORIGIN belongs to the sequence.
            $$dna .= $line;
        } elsif ( $line =~ /^ORIGIN/ ) {
            $in_sequence = 1;
        } else{
            push( @$annotation, $line);
        }
    }
    # Strip whitespace and the position numbers from the sequence lines.
    $$dna =~ s/[\s0-9]//g;
}


Regular expressions (regexes).........

# Match something that looks like an IP address

# Test if a number is in range 0-255

# Match an IP address
use feature 'say';
# One dotted-quad component, 0-255. Brace delimiters are used because the
# pattern itself contains '|', which would end a qr|...| early.
my $ip_part = qr{([0-9]|[0-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])};
# Three "part." groups followed by a final part, anchored at both ends.
if ($ip =~ /^($ip_part\.){3}$ip_part$/) {
    say "valid ip";
}

# Check if the string looks like an email address

# Check if the string is a decimal number

# Check if the string is a hexadecimal number

# Check if the string is an octal number

# Check if the string is binary

# Check if a word appears twice in the string

# Increase all numbers by one in the string
$str =~ s/(\d+)/$1+1/ge

# Extract HTTP User-Agent string from the HTTP headers
/^User-Agent: (.+)$/

# Match printable ASCII characters
/[ -~]/

# Match unprintable ASCII characters
/[^ -~]/

# Match text between two HTML tags

# Replace all <b> tags with <strong>
$html =~ s|<(/)?b>|<$1strong>|g

# Extract all matches from a regular expression
my @matches = $text =~ /regex/g;

Thursday, April 14, 2016

Language: Perl (from James Tisdall book).....(1)...

1#Iterative printing (reading protein sequence data from a file ) 
#!/usr/bin/perl -w
# Reading protein sequence data from a file, take 2 
# The filename of the file containing the protein sequence 
$proteinfilename = 'file.pep';
# open the file, associate "filehandle" with it.  # PROTEINFILE for readability. 
# Stop with a message if the file cannot be opened, rather than reading
# from an unopened handle.
unless ( open(PROTEINFILE, $proteinfilename) ) {
    print "Could not open file $proteinfilename!\n";
    exit;
}
# read and print first line 
$protein = <PROTEINFILE>;
print "\nHere is the first line of the protein file:\n\n";
print $protein;
# read and print Second line 
$protein = <PROTEINFILE>;
print "\nHere is the second line of the protein file:\n\n";
print $protein;
# read and print third line 
$protein = <PROTEINFILE>;
print "\nHere is the third line of the protein file:\n\n";
print $protein;
#close the file. 
close PROTEINFILE;
2# Conditional if-elsif-else  (to find the motif)
#!/usr/bin/perl -w
$word = 'MIFGRKL';
# if-elsif-else conditionals 
# Compare $word against each known peptide and print the first that matches.
if($word eq 'QRRTVD') {
     print "QRRTVD\n";
} elsif($word eq 'ERHDESE') {
     print "ERHDESE\n";
} elsif ( $word eq 'MIFGRKL' ) {
     print "MIFGRKL--the required word!\n";
} else {
     # NOTE(review): the end of this message was cut off in the source; the
     # closing words here are a reconstruction and should be confirmed.
     print "Is \"$word\" a peptide? This program is not sure.\n";
}
3#  Loop (reading protein sequence data from a file)
#!/usr/bin/perl -w 
$proteinfilename = 'file.pep';

#If failed to open print
unless ( open(PROTEINFILE, $proteinfilename) ) {
    print "Could not open file $proteinfilename!\n";
    # Nothing can be read without the file, so stop here.
    exit;
}
# Read using a "while" loop, then print
while( $protein = <PROTEINFILE> ) {
    print "  ######  Here is the next line of the file:\n";
    print $protein;
}
# Close the file
close PROTEINFILE;

4#  Conditional do-until, if-else and array usage (motif finding)
#!/usr/bin/perl -w
# Ask the user to mention filename
# The prompt is kept on one line so no line break is printed inside it.
print "Please type the filename of the protein sequence data: ";
$proteinfilename = <STDIN>;
# Remove newline from the filename
chomp $proteinfilename;
# open the file, or exit 

unless ( open(PROTEINFILE, $proteinfilename) ) {
    print "Cannot open file \"$proteinfilename\"\n\n";
    exit;
}
# Read the protein sequence data from the file, and store in array variable @protein 
@protein = <PROTEINFILE>;
# Close the file
close PROTEINFILE;
# Put the protein sequence data (array) into a single string (into a single scalar variable)
$protein = join( '', @protein);
# Remove whitespace 
$protein =~ s/\s//g;
# Search for the motif, print if found, if not exit
do {
    print "Enter a motif to search for: ";
    $motif = <STDIN>;
    # Remove the newline at the end of $motif  
    chomp $motif;
    # Look for the motif  (pattern matching)
    if ( $protein =~ /$motif/ ) {
        print "I found it!\n\n";

    } else {
        print "I couldn\'t find it.\n\n";
    }
# exit on an empty user input  (the regex /^\s*$/ matches an empty or whitespace-only line)
} until ( $motif =~ /^\s*$/ );
# exit the program
exit;

5#  Scalar variable-initialize-split-join- foreach loop (nucleotide frequency finding)
#It can be done using for loop or while loop as well
#!/usr/bin/perl -w
# Determining frequency of nucleotides
# Reads a DNA file named by the user and counts A, C, G and T; any other
# character is reported and tallied as an error.

# Get the name of the file from user
print "Please type the filename of the DNA sequence data: ";
$dna_filename = <STDIN>;

# Remove the newline from file
chomp $dna_filename;

# open the file, or exit
unless ( open(DNAFILE, '<', $dna_filename) ) {
    print "Cannot open file \"$dna_filename\"\n\n";
    exit;
}

# Read the  data  and store it in array variable @DNA
@DNA = <DNAFILE>;

# Close the file
close DNAFILE;

# put the DNA sequence data into a single string (using join)
$DNA = join( '', @DNA);

# Remove whitespace
$DNA =~ s/\s//g;

# Explore the  array variable @DNA to find each base
# (splitting on the empty string yields one element per character)
@DNA = split( '', $DNA );

# Initialize the counts (with scalar variables)
$count_of_A = 0;
$count_of_C = 0;
$count_of_G = 0;
$count_of_T = 0;
$errors     = 0;

# Count increment of the 4 nucleotides (A,T, C, G)
# The comparison is case-sensitive: lowercase bases fall into the error branch.
foreach $base (@DNA) {
    if      ( $base eq 'A' ) {
        ++$count_of_A;
    } elsif ( $base eq 'C' ) {
        ++$count_of_C;
    } elsif ( $base eq 'G' ) {
        ++$count_of_G;
    } elsif ( $base eq 'T' ) {
        ++$count_of_T;
    } else {
        print "Error - I don\'t recognize this base: $base\n";
        ++$errors;
    }
}

# print the results
print "A = $count_of_A\n";
print "C = $count_of_C\n";
print "G = $count_of_G\n";
print "T = $count_of_T\n";
print "errors = $errors\n";

# exit the program
exit;
6#  Number manipulation (print, add, concatenate)
#!/usr/bin/perl -w
# The same digits held once as a number and once as a string: Perl picks
# numeric or string behaviour from the operator, not from the variable.
($num, $str) = (1234, '1234');

# show both values side by side
print "$num $str\n";

# "+" treats both operands as numbers
$num_or_str = $num + $str;
print "$num_or_str\n";

# joining with an empty separator glues the two values together as strings
$num_or_str = join('', $num, $str);
print "$num_or_str\n";
#A subroutine is called by typing its name, followed by a parenthesized list of arguments.
#A subroutine is defined with the keyword sub, followed by its name and a block enclosed in braces. The exit statement comes before the subroutine definitions.
7#   Using a subroutine to append GGTA to DNA 
#!/usr/bin/perl -w
# A program with a subroutine to append GGTA to DNA

# The DNA  seq contained in the scalar variable
# (sample sequence; any DNA string works here)
$dna = 'CGACGTCTTCTCAGGCGA';

# Call to the subroutine "addGGTA".
# The argument being passed in is $dna; the result is saved  in $new_dna_seq
$new_dna_seq  = addGGTA($dna);
print "Added GGTA to $dna and got $new_dna_seq \n\n";

# The subroutine "addGGTA"  (my construct is essential)
# Takes one DNA string and returns a new string with 'GGTA' appended.
# The "my" copy means the caller's variable is left untouched.
sub addGGTA {
    my($dna) = @_;
    $dna .= 'GGTA';
    return $dna;
}

7#   Using a subroutine to change all A's to T's in DNA 
#!/usr/bin/perl -w
# Run A_to_T on a short all-A sequence and show the input next to the result.

$dna    = 'A' x 5;
$result = A_to_T($dna);
print "Changed all the A's in ", $dna, " to T's and got ", $result, "\n\n";

# Return a copy of the given DNA string with every uppercase 'A' replaced
# by 'T'. The substitution runs on a private copy, so the caller's string
# is not changed.
sub A_to_T {
    my($input) = @_;
    my($dna) = $input;
    $dna =~ s/A/T/g;
    return $dna;
}
8#   Counting the G's in some DNA 
#!/usr/bin/perl -w
# Count the G's in a DNA string given as the first command-line argument.
use strict;

# Usage text; $0 is the name the script was invoked with
my($USAGE) = "$0 DNA\n\n";

# Without an argument there is nothing to count: show the usage and stop
unless(@ARGV) {
    print $USAGE;
    exit;
}

my($dna) = $ARGV[0];
my($num_of_Gs) = countG ( $dna );
print "\nThe DNA $dna has $num_of_Gs G\'s in it!\n\n";

 sub countG {
    my($dna) = @_;
    my($count) = 0;
$count = ( $dna =~ tr/Gg//);
    return $count;

}The  other  "take-home  message"
9# Pass-by-reference
use strict;
use warnings;

# Two arrays handed to reference_sub by reference; both are printed before
# and after the call so any change made inside the subroutine is visible.
my @i = qw(1 2 3);
my @j = qw(a b c);

print "In main program before calling subroutine: i = @i\n";
print "In main program before calling subroutine: j = @j\n";

# The backslash passes a reference to each array instead of a flattened copy
reference_sub(\@i, \@j);

print "In main program after calling subroutine: i = @i\n";
print "In main program after calling subroutine: j = @j\n";

# Receive two array references, print the contents of both arrays, then
# append '4' to the first one. Because the argument is a reference, the
# push changes the caller's array.
sub reference_sub {
    my($i, $j) = @_;
    print "In subroutine : i = " . "@$i\n";
    print "In subroutine : j = " . "@$j\n";
    push(@$i, '4');
}

10# Randomization (story telling by assembling phrases)
use strict;
use warnings;

# Build short nonsense stories by gluing together randomly chosen phrases.

# Declare the variables
my $count;
my $input;
my $sentence;
my $story;

# Here are the arrays of parts of sentences:
my @nouns = (
    'Robin Hood',
    'Joe and Moe',
);

my @verbs = (
    'ran to',
    'giggled with',
    'put hot sauce into the orange juice of',
    'sang stupid songs with',
    'jumped with',
);

my @prepositions = (
    'at the store',
    'over the rainbow',
    'just for the fun of it',
    'at the beach',
    'before dinner',
    'in New York City',
    'in a dream',
    'around the world',
);

# Seed the random number generator.
# time|$$ combines the current time with current process id  to form random seed (srand)
srand(time|$$);

# The do-until loop composes six-sentence, until the user types "quit".
do {
    # (Re)set $story to the empty string each time through the loop
    $story = '';

    # Make 6 sentences per story.
    for ($count = 0; $count < 6; $count++) {
        # rand(N) is in [0, N); int() turns it into a valid array index
        $sentence   = $nouns[int(rand(scalar @nouns))]               . " "
                    . $verbs[int(rand(scalar @verbs))]               . " "
                    . $nouns[int(rand(scalar @nouns))]               . " "
                    . $prepositions[int(rand(scalar @prepositions))] . '. ';
        $story .= $sentence;
    }

    print "\n",$story,"\n";
    print "\nType \"quit\" to quit, or press Enter to continue: ";
    $input = <STDIN>;

# Stop on anything starting with "q" (any case), and also on end of input,
# where <STDIN> returns undef and the loop would otherwise never finish.
} until ( !defined($input) || $input =~ /^\s*q/i );

#Randomization ( Selecting a random position in a string )
#!/usr/bin/perl -w
# Print 20 randomly chosen positions within a DNA string on one line.

# Sample sequence to pick positions from (any non-empty string works)
my $dna = 'AACCGTTAATGGGCATCGATGCTATGCGAGCT';

for (my $i=0 ; $i < 20 ; ++$i ) {
    print randomposition($dna), " ";
}
print "\n";


# Return a random integer position inside the given string:
# at least 0 and less than length($string).
sub randomposition {
    my($string) = @_;
    # parsed as int(rand(length($string)))
    return int rand length $string;
}

#Randomization ( Choosing a random base from a DNA)
#!/usr/bin/perl -w
# Print 20 randomly chosen nucleotides on one line.
my @nucleotides = ('A', 'T', 'C', 'G');

for (my $i=0 ; $i < 20 ; ++$i ) {
    print randomnucleotide(@nucleotides), " ";
}

# finish the line once, after all 20 picks
print "\n";

# Return one element chosen at random from the argument list.
sub randomnucleotide {
    my(@nucs) = @_;
    # rand @nucs is in [0, number of elements); the array subscript
    # truncates it to an integer index
    return $nucs[rand @nucs];
}
#Randomization ( Mutating a base in a DNA seq)
use strict;
use warnings;

# Mutate one random base of a DNA string, then apply ten more mutations in
# a row, printing the sequence after each one.

# Sample starting sequence (any DNA string works here)
my $DNA = 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA';

my $i;
my $mutant;

$mutant = mutate($DNA);

print "\nMutate DNA\n\n";
print "\nOriginal DNA is:\n\n";
print "$DNA\n";
print "\nMutant DNA is:\n\n";
print "$mutant\n";

print "\nHere are 10 more successive mutations:\n\n";
for ($i=0 ; $i < 10 ; ++$i) {
    # each round mutates the result of the previous round
    $mutant = mutate($mutant);
    print "$mutant\n";
}


#Use of 4 subroutines (sub mutate, sub randomelement, sub randomnucleotide, sub randomposition)

# Return a copy of the given DNA string in which one randomly chosen
# position holds a different base. An undefined or empty string is
# returned unchanged, since there is no position to mutate.
sub mutate {
    my($dna) = @_;
    my(@nucleotides) = ('A', 'C', 'G', 'T');
    my($newbase);

    return $dna unless defined($dna) && length($dna);

    # Pick the position to change
    my($position) = randomposition($dna);

    # Keep drawing until the new base differs from the one already there,
    # so that every call really changes the sequence
    do {
        $newbase = randomnucleotide(@nucleotides);
    } until ( $newbase ne substr($dna, $position, 1) );

    # Four-argument substr replaces the one character at $position
    substr($dna, $position, 1, $newbase);

    return $dna;
}


# Return one element chosen at random from the argument list.
sub randomelement {
    my(@array) = @_;
    # rand @array is in [0, number of elements); the array subscript
    # truncates it to an integer index
    return $array[rand @array];
}


# Return one of 'A', 'C', 'G' or 'T', chosen via randomelement.
# Any arguments passed in are ignored; the base list is fixed here.
sub randomnucleotide {
    my(@nucleotides) = ('A', 'C', 'G', 'T');
    return randomelement(@nucleotides);
}


# Return a random integer position inside the given string:
# at least 0 and less than length($string).
sub randomposition {
    my($string) = @_;
    # parsed as int(rand(length($string)))
    return int rand length $string;
}


#Randomization ( Generating a random DNA)
use strict;
use warnings;

# Generate a set of random DNA sequences with lengths in a given range and
# print them one per line.

my $size_of_set = 10;
my $maximum_length = 25;
my $minimum_length = 12;

my @random_DNA = (  );

@random_DNA = make_random_DNA_set( $minimum_length, $maximum_length, $size_of_set );

print "The array of $size_of_set randomly generated DNA sequences\n";
print "  with lengths between $minimum_length and $maximum_length:\n\n";

foreach my $dna (@random_DNA) {
    print "$dna\n";
}
print "\n";


# Subroutines 
# Build and return a list of $size_of_set random DNA strings. The length of
# each one is drawn separately by randomlength from
# $minimum_length..$maximum_length.
sub make_random_DNA_set {
    my($minimum_length,  $maximum_length,  $size_of_set)  =  @_;
    my $length;
    my $dna;
    my @set;

    for (my $i = 0; $i < $size_of_set ; ++$i) {
        $length = randomlength ($minimum_length, $maximum_length);
        $dna = make_random_DNA ( $length );
        push( @set, $dna );
    }

    return @set;
}

# Return a random integer between $minlength and $maxlength, both included.
sub randomlength {
    my($minlength, $maxlength) = @_;
    # rand(span) is in [0, span); the +1 makes $maxlength itself reachable
    return ( int(rand($maxlength - $minlength + 1)) + $minlength );
}

# Return a random DNA string of exactly $length bases, built one base at a
# time with randomnucleotide.
sub make_random_DNA {
    my($length) = @_;
    # Start from '' so that a length of 0 gives an empty string, not undef
    my $dna = '';

    for (my $i=0 ; $i < $length ; ++$i) {
        $dna .= randomnucleotide(  );
    }

    return $dna;
}


# Return one of 'A', 'C', 'G' or 'T', chosen via randomelement.
sub randomnucleotide {
    my(@nucleotides) = ('A', 'C', 'G', 'T');
    return randomelement(@nucleotides);
}

# Return one element chosen at random from the argument list.
sub randomelement {
    my(@array) = @_;
    # rand @array is in [0, number of elements); the array subscript
    # truncates it to an integer index
    return $array[rand @array];
}


#Randomization ( Calculating % identity between  DNA sequences)
use strict;
use warnings;

# Generate random DNA sequences, compare every pair position by position,
# and print the average fraction of matching positions as a percentage.

my $percent;
my @percentages;
my $result;
my @random_DNA = (  );

# Ten sequences, each with minimum and maximum length 10
@random_DNA = make_random_DNA_set( 10, 10, 10 );

# Compare each sequence with every later one, so each pair is scored once
for (my $k = 0 ; $k < scalar @random_DNA - 1 ; ++$k) {
    for (my $i = ($k + 1) ; $i < scalar @random_DNA ; ++$i) {
        $percent = matching_percentage($random_DNA[$k], $random_DNA[$i]);
        push(@percentages, $percent);
    }
}

# Average the per-pair scores
$result = 0;
foreach $percent (@percentages) {
    $result += $percent;
}

$result = $result / scalar(@percentages);

# Turn the fraction into a whole-number percentage (truncated, not rounded)
$result = int ($result * 100);

print "In this run of the experiment, the average percentage of \n";
print "matching positions is $result%\n\n";


# Subroutines 
# Return the fraction (0 to 1) of positions of $string1 at which $string2
# holds the same character. The denominator is the length of $string1;
# positions past the end of $string2 count as mismatches. An empty
# $string1 gives 0 instead of dividing by zero.
sub matching_percentage {
    my($string1, $string2) = @_;
    my($length) = length($string1);
    my($count) = 0;

    return 0 unless $length;

    # Never index past the end of the shorter string
    my($limit) = length($string2) < $length ? length($string2) : $length;

    for (my $position = 0; $position < $limit ; ++$position) {
        if (substr($string1, $position, 1) eq substr($string2, $position, 1)) {
            ++$count;
        }
    }

    return $count / $length;
}


# Build and return a list of $size_of_set random DNA strings. The length of
# each one is drawn separately by randomlength from
# $minimum_length..$maximum_length.
sub make_random_DNA_set {
    my($minimum_length,  $maximum_length,  $size_of_set)  =  @_;
    my $length;
    my $dna;
    my @set;

    for (my $i = 0; $i < $size_of_set ; ++$i) {
        $length = randomlength ($minimum_length, $maximum_length);
        $dna = make_random_DNA ( $length );
        push( @set, $dna );
    }

    return @set;
}

# Return a random integer between $minlength and $maxlength, both included.
sub randomlength {
    my($minlength, $maxlength) = @_;
    # rand(span) is in [0, span); the +1 makes $maxlength itself reachable
    return ( int(rand($maxlength - $minlength + 1)) + $minlength );
}

# Return a random DNA string of exactly $length bases, built one base at a
# time with randomnucleotide.
sub make_random_DNA {
    my($length) = @_;
    # Start from '' so that a length of 0 gives an empty string, not undef
    my $dna = '';

    for (my $i=0 ; $i < $length ; ++$i) {
        $dna .= randomnucleotide(  );
    }

    return $dna;
}

# Return one of 'A', 'C', 'G' or 'T', chosen via randomelement.
sub randomnucleotide {
    my(@nucleotides) = ('A', 'C', 'G', 'T');
    return randomelement(@nucleotides);
}

# Return one element chosen at random from the argument list.
sub randomelement {
    my(@array) = @_;

    # rand @array is in [0, number of elements); the array subscript
    # truncates it to an integer index
    return $array[rand @array];
}

Sunday, April 3, 2016

Blog tricks: Conversion to pdf..

 Convert your blog to a PDF and a Word file

  1. First export your blog, e.g. a Blogger blog: go to your blog’s dashboard and look for Settings/Other.
  2. Download the .xml file to your hard drive. To do this, go to Dashboard/Settings/Other/Export blog/Download blog.
  3. Keep the file stored on your hard drive – it is quite safe!
  4. The .xml file looks quite unreadable!
  5. Open Instructions are provided for three blogging platforms.
  6. Choose your settings in BlogBooker and get started: import your .xml file and create your BlogBook!
  7. These are the options I chose.
  8. Your PDF is done! Before you know it, it is available as a PDF! Click here.
  9. Save your PDF.
  10. Edit your PDF by converting it to a Word document with Free PDF to Word Converter.
  11. Download it and install it on your computer to use.
  12. You can now easily edit your pages!