Sunday, December 20, 2015

Bioinformatics (2): Sequencers/NGS data usage...

Ten years ago, most genomes were still sequenced by the Sanger method

Sanger method: selecting and growing clones of whole-genome shotgun libraries, isolating sequencing templates, and performing the sequencing reactions, followed by electrophoresis on a bank of 96- or 384-well capillary machines. The output from such production lines was then assembled automatically and usually yielded a high-quality draft of the genome.
Computational challenges arise from sample processing through assembly, binning, and identification of species; further challenges are annotation of genes and, of course, assignment of function.
Sample/library preparation differs for each sequencer.
Next-generation sequencing (NGS) data is high-throughput.
Information can be extracted from it through intensive manipulation and robust pipelines.
The two main approaches are reference-based mapping and de novo assembly.
Read positions are analysed to find SNPs, genotypes, and indels.
Mapping suffers from low coverage depth at certain places, which undermines confidence in SNP calling.
Less than 10x depth is ambiguous.
A BAM file is the map of the data to the reference; a quick depth check is sketched below.
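A minimal sketch of that depth check, assuming a sorted, indexed BAM named sample.bam (a hypothetical file) and the 10x threshold above:
#list positions covered by fewer than 10 reads
samtools depth -a sample.bam | awk '$3 < 10 {print $1"\t"$2"\t"$3}' > low_coverage_positions.txt
#count how many ambiguous positions remain
wc -l low_coverage_positions.txt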
-------------------------------------
Illumina........
MiSeq: Small genome, amplicon, targeted gene sequencing
HiSeq: Big genome, exome, transcriptome
Sequenced using paired-end mode with 36bp reads.
The short reads require high coverage.
Short reads also leave a larger number of contigs to resolve before genome closure.

#Illumina SNP analysis
#Arrange Paired-end Reads
ls data/
for i in $(awk -F"L001" '{gsub("_$","",$1);print $1}' <(ls data/*.fastq.gz) | sort | uniq); do mkdir $(basename $i); mkdir $(basename $i)/Raw; cp $i*.fastq.gz $(basename $i)/Raw/. ; done
ls -d sample*
#Quality Trimming
for i in $(ls -d sample*/); do echo $i; cd $i; ls Raw; sickle pe -f Raw/*_R1_001.fastq.gz -r Raw/*_R2_001.fastq.gz -t sanger -o trimmed.$(basename ${i})_R1.fastq -p trimmed.$(basename ${i})_R2.fastq -s singles.$(basename ${i}).file.fastq -q 20 -l 10 ; cd ..; done
ls -d sample*/*.fastq
#Sequence Alignment and Quality Check
bwa index Cdiff078.fa 
for i in $(ls -d sample*/); do echo $i; cd $i; bwa mem -M ../Cdiff078.fa trimmed.$(basename ${i})_R1.fastq trimmed.$(basename ${i})_R2.fastq | samtools view -hbS - | samtools sort -m 1000000000  -  $(basename ${i});cd ..;done
#assess the quality of mapped reads by generating charts for GC Bias, Mean Quality by Cycle, and Quality Score Distribution
for i in $(ls -d sample*/); do echo $i; cd $i; java -Xmx2g -jar $(which CollectGcBiasMetrics.jar) R=../Cdiff078.fa I=$(basename ${i}).bam O=$(basename ${i})_GCBias.txt CHART=$(basename ${i})_GCBias.pdf ASSUME_SORTED=true; cd ..;done
for i in $(ls -d sample*/); do echo $i; cd $i; java -Xmx2g -jar $(which MeanQualityByCycle.jar) R=../Cdiff078.fa I=$(basename ${i}).bam O=$(basename ${i})_Qcycle.txt CHART=$(basename ${i})_Qcycle.pdf; cd ..;done
for i in $(ls -d sample*/); do echo $i; cd $i; java -Xmx2g -jar $(which QualityScoreDistribution.jar) R=../Cdiff078.fa I=$(basename ${i}).bam O=$(basename ${i})_Qdist.txt CHART=$(basename ${i})_Qdist.pdf; cd ..;done
#Mark and Remove Duplicates (by Picard)
for i in $(ls -d sample*/); do echo $i; cd $i; ls $(basename ${i}).bam; java -jar $(which MarkDuplicates.jar) INPUT=$(basename ${i}).bam OUTPUT=$(basename ${i}).rmdup.bam METRICS_FILE=duplicateMatrix REMOVE_DUPLICATES=true; java -jar $(which AddOrReplaceReadGroups.jar) I=$(basename ${i}).rmdup.bam O=$(basename $i).rmdup.addgp.bam LB=whatever PL=illumina PU=whatever SM=whatever; samtools index $(basename ${i}).rmdup.addgp.bam; cd ..; done
#Indel Realignment
samtools faidx Cdiff078.fa
if [ ! -f  "Cdiff078.dict" ]; then java -jar $(which CreateSequenceDictionary.jar) R=Cdiff078.fa O=Cdiff078.dict; fi
for i in $(ls -d sample*/); do echo $i;cd $i; java -Xmx2g -jar $(which GenomeAnalysisTK.jar) -T RealignerTargetCreator -R ../Cdiff078.fa -I $(basename ${i}).rmdup.addgp.bam -o $(basename ${i}).rmdup.addgp.intervals;java -Xmx4g -jar $(which GenomeAnalysisTK.jar) -I $(basename ${i}).rmdup.addgp.bam -R ../Cdiff078.fa -T IndelRealigner -targetIntervals $(basename ${i}).rmdup.addgp.intervals  -o $(basename ${i}).realigned.bam;cd ..;done
#SNPs and Indels Detection
samtools mpileup -B -f Cdiff078.fa  sample1/sample1.realigned.bam sample2/sample2.realigned.bam sample3/sample3.realigned.bam | java -jar VarScan.v2.3.7.jar mpileup2cns --min-coverage 2 --min-var-freq 0.8 --p-value 0.005 --variants --output-vcf > variants.vcf
head -30 variants.vcf 
#Filter Recombination
for i in $(grep -v "#" variants.vcf| cut -f1|sort|uniq); do grep -i -e "$i" -e "#CHROM" variants.vcf> $i.vcf;done
#count SNP density
for i in $(ls Cdiff078_*.vcf); do echo $i; python contig_count_snp_density2.py -vcfs ${i} --win 200 ; done
#Collate all the snp density files together
for i in $(ls *snp_densitywin200.csv); do awk '{n=split(FILENAME,a,".");print a[1]"," $0}' $i | sed '1d' ;done> contigs_snps_density.csv
#plot the snp density using the following R script:
library(ggplot2)
library(reshape2)
library(colorspace) 
density<- read.csv("contigs_snps_density.csv", header=FALSE, sep=",")
colnames(density)<- c("contigs","pos","count")

p1 <- qplot(x=density$pos, y=density$count, color=density$contigs) + guides(col = guide_legend(title="Contigs", nrow = 18))
p <- p1 + labs(title="SNP density Plot", x="Base Position", y="Density")
ggsave("snp_density.pdf", plot=p, width =9)
#Filter the variants at a density level 5 to avoid false positives due to recombination.
python filter_variants.py --use-density --density 5 --window 200 variants.vcf > variants.filtered.vcf
#Count the number of SNPs after filtering
vcftools --vcf variants.filtered.vcf --remove-indels
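vcftools reports the number of sites kept; as an alternative sketch, the SNP records can also be counted directly (--recode --stdout writes the filtered VCF to stdout):
vcftools --vcf variants.filtered.vcf --remove-indels --recode --stdout | grep -vc "^#"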
#Convert the filtered SNPs into a pseudo-sequence using the python script 'vcf2phyloviz.py'
python vcf2phyloviz.py --metadata metadata.txt --output_prefix snps variants.filtered.vcf
cat snps_all_alleles.fasta 
#Phylogenetic Tree Build using FastTree
FastTree -nt  snps_all_alleles.fasta > snps.fastree.nwk
#Filter Recombination and Tree Build using Gubbins
vcftools --vcf variants.vcf --out snps_only.variants --remove-indels --recode --recode-INFO-all
python vcf2phyloviz.py -m  metadata.txt -o snps_only.variants snps_only.variants.recode.vcf
cat snps_only.variants_all_alleles.fasta
export PYENV_ROOT="/home/opt/.pyenv"
export PATH="$PYENV_ROOT/bin:$PATH"
eval "$(pyenv init -)"
export PATH=/home/opt/fastml/programs/fastml:$PATH
export PATH=/home/opt/standard-RAxML:$PATH
#Now run Gubbins to generate the phylogeny with FastTree (or optionally RAxML).
run_gubbins.py --tree_builder fasttree --prefix gubbins snps_only.variants_all_alleles.fasta

#Visualize the tree in FigTree
---------------------------------------------------------------

Pacific BioSciences........
100x coverage possible with latest chemistry
The longest read possible for PacBio is ~5000bp (average read length 1500bp).

Steps of de novo assembly: error correction, assembly, polishing, and circularization.

1# PacBio Corrected Reads (PBcR) is a pipeline to correct errors in long reads.
2# The Celera assembler is used to assemble the corrected long reads into contigs.
3# Quiver polishes (error-corrects) the assembly.
4# When assembly results in a circular genome, as confirmed by a dot plot, the start and end contain the same sequence. The ends have low mapping scores, and therefore poor consensus, which makes closing the genome tricky.
SMRT Analysis or the minimus2 pipeline from the AMOS package can address this issue.

polished assembly -------> break it in the middle so the overlapping ends can be merged:

toAmos -s polished_assembly -o circularized.afg
minimus2 circularized

cat circularized.singletons.seq >> circularized.fasta

#Use the perl script circ-perm.pl to find dnaA in the genome and create a break at oriC
circ-perm.pl
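The perl script itself is not shown; as a rough sketch of the same idea, dnaA can be located with BLAST+ (dnaA.fasta, a query file holding a dnaA sequence, is an assumed input):
#build a nucleotide BLAST database from the circularized assembly
makeblastdb -in circularized.fasta -dbtype nucl
#report where the dnaA query hits, tab-separated; the hit coordinates give the break point
blastn -query dnaA.fasta -db circularized.fasta -outfmt 6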

Ion Torrent (Thermo-Fisher).......
Semiconductor technology

Oxford Nanopore MinION.......
MinION produces longer reads (~10,000bp).
#Issues with sequencers
Illumina: systematic bias
PacBio: random bias
Drawbacks of sequencing
Many genomes are still in draft status, with too many contigs
#GC-rich regions are hard to sequence
#A mutation can be a genetic marker and not a marker of resistance
#Genes and their surrounding regions are commonly inspected
#Mutations can lie in structural genes, promoters, or intergenic regions
The difference between the reference-supporting read count and the variant-supporting read count determines whether a base is called as a SNP.
A quality score is required because every base call has some probability of being incorrect.
Q10: 1 in 10 wrong : accuracy 90%
Q20: 1 in 100 wrong : accuracy 99%
Q30: 1 in 1000 wrong : accuracy 99.9%
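The table follows the Phred formula P_error = 10^(-Q/10); a one-line awk sketch reproduces it:
awk 'BEGIN {for (q = 10; q <= 30; q += 10) printf "Q%d: 1 in %d wrong : accuracy %g%%\n", q, 10^(q/10), 100*(1 - 10^(-q/10))}'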
Synonymy: a base change that does not alter the amino acid is synonymous (S); one that does is non-synonymous (NS).
Lab variant call pipeline


FASTQ file mapped to reference genome -------> SAM/BAM file -------> recalibrated BAM file -------> mpileup file --varscan--> consensus file (a VCF file) -------> SNP file

A BAM file needs to be indexed by SAMtools. BAM files can be viewed in IGV.
The consensus file, when filtered, generates the SNP file.
If the variant-supporting read count is greater than or equal to 3, consider it a SNP (commands used: awk, wc -l); a sketch follows below.
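A minimal sketch of that filter, assuming a VarScan-style table where column 6 (read2, per the column list below) holds the variant-supporting read count:
#skip the header line, keep rows with >= 3 variant-supporting reads
awk -F"\t" 'NR > 1 && $6 >= 3' consensus_file > snp_file
#number of SNPs retained
wc -l snp_file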

GATK (from Broad institute), Samtools and varscan for variant calling

Some genomic regions are difficult to sequence, so coverage depth is poor there
Coverage is generated by BamTools.
VCF: Variant Call Format.
Tools to manipulate VCF files: SAMtools, BEDTools, VCFtools, GATK tools.
The VCF file here has 19 columns:
$1: chrom
$2: pos
$3: ref
$4: mut
$5: read1
$6: read2
$19: var allele
Genes on the minus strand need to be reverse-complemented; a one-line sketch follows.
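A one-line shell sketch of reverse complementing a sequence (the example sequence is made up):
#reverse the string, then swap complementary bases
echo "ATGCCGTA" | rev | tr 'ACGTacgt' 'TGCAtgca'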

False Positive SNP: called by the NGS platform but not by the gold standard (Sanger)
False Negative SNP: called by the gold standard (Sanger) but not by the NGS platform
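A minimal sketch of tallying these, assuming sorted one-position-per-line files ngs_positions.txt and sanger_positions.txt (both hypothetical):
#positions only in the NGS calls = false positives
comm -23 ngs_positions.txt sanger_positions.txt | wc -l
#positions only in the Sanger calls = false negatives
comm -13 ngs_positions.txt sanger_positions.txt | wc -l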

A SNP analysis pipeline..........
step 1: Quality score recalibration
java  -jar file.jar  -R file.fasta  -I file.bam  -O output_file

step 2: Covariates
java  -jar file.jar  -R file.fasta  -Known_SNPs  -I file.bam   -T CountCovariates -recalFile recalibrated_file

step 3: Known_SNPs file generation (Use of MUMmer for rapid alignment of entire genome)


step 4: mpileup file generation
samtools mpileup -BI  -f path_to_ref_fasta  recalibrated.bam  > file.mpileup

step 5: SNP and consensus calling
java -jar  VarScan.jar  pileup2snp  file.mpileup  --min-coverage 2  --min-var-freq 0.55 --p-value 0.10  > snp_file

java -jar  VarScan.jar  pileup2cns  file.mpileup  --min-coverage 2  --min-var-freq 0.55 --p-value 0.10  >consensus_file

step 6: Uploading SNP data to SQL database
LOAD DATA LOCAL INFILE 'path_to_file'
INTO TABLE TABLE_NAME (ID, pos, ref, mut,....)

step 7: Generate coverage file
bamtools coverage -in file.bam -out  output_file
awk '{if ($2 >= 1000 && $2 <= 2000 ) {print $3}}' file
awk '{if ($2 >= 1000 && $2 <= 2000) {sum += $3; count += 1}} END {print sum, count, sum/count}' file
########
Indel calculation
IMF in a VCF file is the maximum fraction of reads supporting an indel
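A hedged sketch of pulling IMF out of such a VCF with bcftools (assuming IMF is present in the INFO field, as in samtools/bcftools output):
#report chromosome, position, and IMF for indel records
bcftools query -i 'TYPE="indel"' -f '%CHROM\t%POS\t%INFO/IMF\n' file.vcf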
#Code for using the aligner bbmap
bbmap.sh in1=reads1.fq in2=reads2.fq out=mapped.sam ref=ecoli.fa nodisk
----------------------------------------------
#Check human genome  vcf file with polymorphisms 
#Using tools and R packages
#get bwa
cd /root
wget -O bwa-0.7.10.tar.bz2 http://sourceforge.net/projects/bio-bwa/files/bwa-0.7.10.tar.bz2/download
#untar and compile bwa (via make)
tar xvfj bwa-0.7.10.tar.bz2
cd bwa-0.7.10
make
cp bwa /usr/local/bin
#install some tools
apt-get update
apt-get -y install samtools screen git curl gcc make g++ python-dev unzip \
      default-jre pkg-config libncurses5-dev r-base-core \
      r-cran-gplots python-matplotlib sysstat libcurl4-openssl-dev libxml2-dev
git clone https://github.com/schimar/ngs2014_popGen.git
cd ngs2014_popGen/var_call2/
#index the reference genome
bwa index ref_genome.fna
#map reads to the indexed reference genome
bwa aln ref_genome.fna read_file.fq > mapped_reads.sai
#create the SAM file
bwa samse ref_genome.fna mapped_reads.sai read_file.fq > mapped_reads.sam
#index the reference genome
samtools faidx ref_genome.fna
#convert from SAM to BAM
samtools view -b -S -o mapped_reads.bam mapped_reads.sam
#sort the BAM
samtools sort mapped_reads.bam mapped_reads.sorted
#index again, but now the sorted BAM file
samtools index mapped_reads.sorted.bam
#visualize the alignment
samtools tview mapped_reads.sorted.bam ref_genome.fna
#variant exploration with Bioconductor
R
source("http://bioconductor.org/biocLite.R")
biocLite()
biocLite("VariantAnnotation")
biocLite("SNPlocs.Hsapiens.dbSNP.20101109")
biocLite("BSgenome.Hsapiens.UCSC.hg19_1.3.1000")
#quality control
library(VariantAnnotation)
fl <- system.file("extdata", "chr22.vcf.gz", package="VariantAnnotation")
(hdr <- scanVcfHeader(fl))
info(hdr)[c("VT", "RSQ"),]
(vcf <- readVcf(fl, "hg19"))
head(rowData(vcf), 3)
rowData(vcf) <- renameSeqlevels(rowData(vcf), c("22"="ch22"))
#load the SNP database and discover whether our SNPs are in dbSNP
library(SNPlocs.Hsapiens.dbSNP.20101109)
destination <- tempfile()
pre <- FilterRules(list(isLowCoverageExomeSnp = function(x) {
grepl("LOWCOV,EXOME", x, fixed=TRUE)
}))
filt <- FilterRules(list(isSNP = function(x) info(x)$VT == "SNP"))
snpFilt <- filterVcf(fl, "hg19", destination, prefilters=pre, filters= filt)
vcf_filt <- readVcf(snpFilt, "hg19")
rowData(vcf)
rowData(vcf_filt)
#compare new snps with database
inDbSNP <- rownames(vcf) %in% rownames(vcf_filt)
table(inDbSNP)
metrics <- data.frame(inDbSNP = inDbSNP, RSQ = info(vcf)$RSQ)
#visualize
library(ggplot2)
ggplot(metrics, aes(RSQ, fill=inDbSNP)) +
  geom_density(alpha=0.5) +
  scale_x_continuous(name="MaCH / Thunder Imputation Quality") +
  scale_y_continuous(name="Density") +
  theme(legend.position="top")
-------------------------------------------------------------------------------
#To download human genome VCF files:
http://uswest.ensembl.org/info/data/ftp/index.html
