Monday, May 2, 2016

Tools to learn and work to do......

Alignment...
Alignment of sequencing reads to a reference genome is a core step in the analysis workflows for many high-throughput sequencing assays, including ChIP-Seq, RNA-seq, ribosome profiling and others.
Bowtie  uses an extremely economical data structure called the FM index to store the reference genome sequence and allows it to be searched rapidly. 
TopHat uses Bowtie as an alignment ‘engine’ 

Mauve?
#To run the Mauve GUI from within Terminal 
#Add directory with executables to Mauve path
cd Mauve/
ls
cd mauve_2.3.1/
./Mauve 

File
Align with progressive Mauve
Select the executable folder (by navigation
Mauve Console starts running (1-2 minutes for two full genomes)
 Viewing the alignment
Zoom in    Ctrl + UpScroll 
display left    Ctrl + LeftScroll 
display right    Ctrl + RightLarge 
left scroll    Shift + Ctrl + LeftLarge 
right scroll    Shift + Ctrl + Right
Tool ---------> Export ---------> Export SNPs   

Indel determination..
Whats the logic used to pull information from vcf file?


R PSI Blast
#Reversed Position Specific BLAST, or RPS BLAST, use at command line
#extract just these *.smp files from the large archive (cdd.tar.gz).
#run the formatrpsdb tool to build a database:
formatrpsdb -t Sigma.v001 -i Sigma.pn -o T -f 9.82 -n Sigma -S 100.0
#creates the eight files i.e. Sigma.aux, Sigma.loo, Sigma.phr, Sigma.pin, Sigma.psd, Sigma.psi, Sigma.psq and Sigma.rps which together make up the database.
#Compare
rpsblast -i rpoD.faa -d Sigma -e 0.00001
rpsblast -i rpoD.faa -d Sigma -e 0.00001 -o rpoD.txt
rpsblast -i rpoD.faa -d Sigma -e 0.00001 -m 7 -o rpoD.xml
#If comparing with Pfam database
rpsblast -i rpoD.faa -d Pfam -e 0.00001
#comparing entire genome with the Sigma database made earlier.
rpsblast -i NC_003197.faa -d Sigma -e 0.00001 -o NC_003197.txt
rpsblast -i NC_003197.faa -d Sigma -e 0.00001 -m 7 -o NC_003197.xml

#Analyzing RPS-BLAST output with Biopython
#For the smaller xml file
from Bio.Blast import NCBIXML
for record in NCBIXML.parse(open("rpoD.xml")) :
print "QUERY: %s" % record.query
for align in record.alignments :
print " MATCH: %s..." % align.title[:60]
for hsp in align.hsps :
print " HSP, e=%f, from position %i to %i" \
% (hsp.expect, hsp.query_start, hsp.query_end)
if hsp.align_length < 60 :
print " Query: %s" % hsp.query
print " Match: %s" % hsp.match
print " Sbjct: %s" % hsp.sbjct
else :
print " Query: %s..." % hsp.query[:57]
print " Match: %s..." % hsp.match[:57]
print " Sbjct: %s..." % hsp.sbjct[:57]
print "Done"


#For the large xml file
from Bio.Blast import NCBIXML
for record in NCBIXML.parse(open("NC_003197.xml")) :
    #We want to ignore any queries with no search results:
    if record.alignments :
        print "QUERY: %s..." % record.query[:60]
        for align in record.alignments :
            for hsp in align.hsps :
                print " %s HSP, e=%f, from position %i to %i" \
                % (align.hit_id, hsp.expect, hsp.query_start, hsp.query_end)
print "Done"
That should give you the following output - note there is only 



#Running RPS-BLAST from Biopython
#Adjust the file locations to match your own:
rpsblast_db = "C:\\Blast\\cdd\\Sigma"
rpsblast_exe = "C:\\Blast\\bin\\rpsblast.exe"

query_filename = "rpoD.faa"
#query_filename = "NC_003197.faa"

E_VALUE_THRESH = 0.00001 #Adjust the expectation cut-off here

from Bio.Blast import NCBIStandalone
output_handle, error_handle = NCBIStandalone.rpsblast(rpsblast_exe, \
rpsblast_db, query_filename, expectation=E_VALUE_THRESH)


from Bio.Blast import NCBIXML
for record in NCBIXML.parse(output_handle) :
    #We want to ignore any queries with no search results:
    if record.alignments :
        print "QUERY: %s..." % record.query[:60]
        for align in record.alignments :
            for hsp in align.hsps :
                print " %s HSP, e=%f, from position %i to %i" \
                % (align.hit_id, hsp.expect, hsp.query_start, hsp.query_end)
                assert hsp.expect <= E_VALUE_THRESH
print "Done"

No comments:

Post a Comment