Sunday, December 20, 2015

Bioinformatics (2): Sequencers/NGS data usage...

Ten years ago, most genomes were still sequenced by the Sanger method

Sanger method: selecting and growing clones of whole-genome shotgun libraries, isolating sequencing templates, and performing the sequencing reactions, followed by electrophoresis on a bank of 96- or 384-well capillary machines. The output from such production lines was then assembled automatically and usually yielded a high-quality draft of the genome.
Computational challenges arise from sample processing through assembly, binning, and identification of species; further challenges are annotation of genes and, of course, assignment of function.
Sample/library preparation differs for each sequencer.
Next-generation sequencing (NGS) data is high-throughput.
Information can be extracted from it through intensive manipulation and robust pipelines.
The two main approaches are reference-based mapping and de novo assembly.
Read positions are analysed to find SNPs, genotypes, and indels.
Mapping suffers from low coverage depth at certain places, which undermines confidence in SNP calling.
Less than 10x depth is ambiguous.
A BAM file is the map of the data to the reference; a quick depth check is sketched below.
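A minimal sketch of that depth check, assuming a sorted, indexed BAM named sample.bam (a hypothetical file) and the 10x threshold above:
#list positions covered by fewer than 10 reads
samtools depth -a sample.bam | awk '$3 < 10 {print $1"\t"$2"\t"$3}' > low_coverage_positions.txt
#count how many ambiguous positions remain
wc -l low_coverage_positions.txt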
-------------------------------------
Illumina........
MiSeq: Small genome, amplicon, targeted gene sequencing
HiSeq: Big genome, exome, transcriptome
Sequenced using paired-end mode with 36bp reads.
The short reads require high coverage.
Short reads also leave a larger number of contigs to resolve before genome closure.

#Illumina SNP analysis
#Arrange Paired-end Reads
ls data/
for i in $(awk -F"L001" '{gsub("_$","",$1);print $1}' <(ls data/*.fastq.gz) | sort | uniq); do mkdir $(basename $i); mkdir $(basename $i)/Raw; cp $i*.fastq.gz $(basename $i)/Raw/. ; done
ls -d sample*
#Quality Trimming
for i in $(ls -d sample*/); do echo $i; cd $i; ls Raw; sickle pe -f Raw/*_R1_001.fastq.gz -r Raw/*_R2_001.fastq.gz -t sanger -o trimmed.$(basename ${i})_R1.fastq -p trimmed.$(basename ${i})_R2.fastq -s singles.$(basename ${i}).file.fastq -q 20 -l 10 ; cd ..; done
ls -d sample*/*.fastq
#Sequence Alignment and Quality Check
bwa index Cdiff078.fa 
for i in $(ls -d sample*/); do echo $i; cd $i; bwa mem -M ../Cdiff078.fa trimmed.$(basename ${i})_R1.fastq trimmed.$(basename ${i})_R2.fastq | samtools view -hbS - | samtools sort -m 1000000000  -  $(basename ${i});cd ..;done
#assess the quality of mapped reads by generating charts for GC Bias, Mean Quality by Cycle, and Quality Score Distribution
for i in $(ls -d sample*/); do echo $i; cd $i; java -Xmx2g -jar $(which CollectGcBiasMetrics.jar) R=../Cdiff078.fa I=$(basename ${i}).bam O=$(basename ${i})_GCBias.txt CHART=$(basename ${i})_GCBias.pdf ASSUME_SORTED=true; cd ..;done
for i in $(ls -d sample*/); do echo $i; cd $i; java -Xmx2g -jar $(which MeanQualityByCycle.jar) R=../Cdiff078.fa I=$(basename ${i}).bam O=$(basename ${i})_Qcycle.txt CHART=$(basename ${i})_Qcycle.pdf; cd ..;done
for i in $(ls -d sample*/); do echo $i; cd $i; java -Xmx2g -jar $(which QualityScoreDistribution.jar) R=../Cdiff078.fa I=$(basename ${i}).bam O=$(basename ${i})_Qdist.txt CHART=$(basename ${i})_Qdist.pdf; cd ..;done
#Mark and Remove Duplicates (by Picard)
for i in $(ls -d sample*/); do echo $i; cd $i; ls $(basename ${i}).bam; java -jar $(which MarkDuplicates.jar) INPUT=$(basename ${i}).bam OUTPUT=$(basename ${i}).rmdup.bam METRICS_FILE=duplicateMatrix REMOVE_DUPLICATES=true; java -jar $(which AddOrReplaceReadGroups.jar) I=$(basename ${i}).rmdup.bam O=$(basename $i).rmdup.addgp.bam LB=whatever PL=illumina PU=whatever SM=whatever; samtools index $(basename ${i}).rmdup.addgp.bam; cd ..; done
#Indel Realignment
samtools faidx Cdiff078.fa
if [ ! -f  "Cdiff078.dict" ]; then java -jar $(which CreateSequenceDictionary.jar) R=Cdiff078.fa O=Cdiff078.dict; fi
for i in $(ls -d sample*/); do echo $i;cd $i; java -Xmx2g -jar $(which GenomeAnalysisTK.jar) -T RealignerTargetCreator -R ../Cdiff078.fa -I $(basename ${i}).rmdup.addgp.bam -o $(basename ${i}).rmdup.addgp.intervals;java -Xmx4g -jar $(which GenomeAnalysisTK.jar) -I $(basename ${i}).rmdup.addgp.bam -R ../Cdiff078.fa -T IndelRealigner -targetIntervals $(basename ${i}).rmdup.addgp.intervals  -o $(basename ${i}).realigned.bam;cd ..;done
#SNPs and Indels Detection
samtools mpileup -B -f Cdiff078.fa  sample1/sample1.realigned.bam sample2/sample2.realigned.bam sample3/sample3.realigned.bam | java -jar VarScan.v2.3.7.jar mpileup2cns --min-coverage 2 --min-var-freq 0.8 --p-value 0.005 --variants --output-vcf > variants.vcf
head -30 variants.vcf 
#Filter Recombination
for i in $(grep -v "#" variants.vcf| cut -f1|sort|uniq); do grep -i -e "$i" -e "#CHROM" variants.vcf> $i.vcf;done
#count SNP density
for i in $(ls Cdiff078_*.vcf); do echo $i; python contig_count_snp_density2.py -vcfs ${i} --win 200 ; done
#Collate all the snp density files together
for i in $(ls *snp_densitywin200.csv); do awk '{n=split(FILENAME,a,".");print a[1]"," $0}' $i | sed '1d' ;done> contigs_snps_density.csv
#plot the snp density using the following R script:
library(ggplot2)
library(reshape2)
library(colorspace) 
density<- read.csv("contigs_snps_density.csv", header=FALSE, sep=",")
colnames(density)<- c("contigs","pos","count")

p1 <- qplot(x=density$pos, y=density$count, color=density$contigs) + guides(col = guide_legend(title="Contigs", nrow = 18))
p <- p1 + labs(title="SNP density Plot", x="Base Position", y="Density")
ggsave("snp_density.pdf", plot=p, width =9)
#Filter the variants at a density level 5 to avoid false positives due to recombination.
python filter_variants.py --use-density --density 5 --window 200 variants.vcf > variants.filtered.vcf
#Count the number of SNPs after filtering
vcftools --vcf variants.filtered.vcf --remove-indels
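vcftools reports the number of sites kept; as an alternative sketch, the SNP records can also be counted directly (--recode --stdout writes the filtered VCF to stdout):
vcftools --vcf variants.filtered.vcf --remove-indels --recode --stdout | grep -vc "^#"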
#Convert the filtered SNPs into a pseudo-sequence using the python script 'vcf2phyloviz.py'
python vcf2phyloviz.py --metadata metadata.txt --output_prefix snps variants.filtered.vcf
cat snps_all_alleles.fasta 
#Phylogenetic Tree Build using FastTree
FastTree -nt  snps_all_alleles.fasta > snps.fastree.nwk
#Filter Recombination and Tree Build using Gubbins
vcftools --vcf variants.vcf --out snps_only.variants --remove-indels --recode --recode-INFO-all
python vcf2phyloviz.py -m  metadata.txt -o snps_only.variants snps_only.variants.recode.vcf
cat snps_only.variants_all_alleles.fasta
export PYENV_ROOT="/home/opt/.pyenv"
export PATH="$PYENV_ROOT/bin:$PATH"
eval "$(pyenv init -)"
export PATH=/home/opt/fastml/programs/fastml:$PATH
export PATH=/home/opt/standard-RAxML:$PATH
#Now run Gubbins to generate the phylogeny with FastTree (or optionally RAxML).
run_gubbins.py --tree_builder fasttree --prefix gubbins snps_only.variants_all_alleles.fasta

#Visualize the tree in FigTree
---------------------------------------------------------------

Pacific BioSciences........
100x coverage possible with latest chemistry
The longest read possible for PacBio is ~5000bp (average read length 1500bp).

Steps of de novo assembly: error correction, assembly, polishing, and circularization.

1# PacBio Corrected Reads (PBcR) is a pipeline to correct errors in long reads.
2# The Celera assembler is used to assemble the corrected long reads into contigs.
3# Quiver polishes (error-corrects) the assembly.
4# When assembly results in a circular genome, as confirmed by a dot plot, the start and end contain the same sequence. The ends have low mapping scores, and therefore poor consensus, which makes closing the genome tricky.
SMRT Analysis or the minimus2 pipeline from the AMOS package can address this issue.

polished assembly -------> break it in the middle so the overlapping ends can be merged:

toAmos -s polished_assembly -o circularized.afg
minimus2 circularized

cat circularized.singletons.seq >> circularized.fasta

#Use the perl script circ-perm.pl to find dnaA in the genome and create a break at oriC
circ-perm.pl
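The perl script itself is not shown; as a rough sketch of the same idea, dnaA can be located with BLAST+ (dnaA.fasta, a query file holding a dnaA sequence, is an assumed input):
#build a nucleotide BLAST database from the circularized assembly
makeblastdb -in circularized.fasta -dbtype nucl
#report where the dnaA query hits, tab-separated; the hit coordinates give the break point
blastn -query dnaA.fasta -db circularized.fasta -outfmt 6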

Ion Torrent (Thermo-Fisher).......
Semiconductor technology

Oxford Nanopore MinION.......
MinION produces longer reads (~10,000bp).
#Issues with sequencers
Illumina: systematic bias
PacBio: random bias
Drawbacks of sequencing
Many genomes are still in draft status, with too many contigs
#GC-rich regions are hard to sequence
#A mutation can be a genetic marker and not a marker of resistance
#Genes and their surrounding regions are commonly inspected
#Mutations can lie in structural genes, promoters, or intergenic regions
The difference between the reference-supporting read count and the variant-supporting read count determines whether a base is called as a SNP.
A quality score is required because every base call has some probability of being incorrect.
Q10: 1 in 10 wrong : accuracy 90%
Q20: 1 in 100 wrong : accuracy 99%
Q30: 1 in 1000 wrong : accuracy 99.9%
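The table follows the Phred formula P_error = 10^(-Q/10); a one-line awk sketch reproduces it:
awk 'BEGIN {for (q = 10; q <= 30; q += 10) printf "Q%d: 1 in %d wrong : accuracy %g%%\n", q, 10^(q/10), 100*(1 - 10^(-q/10))}'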
Synonymy: a base change that does not alter the amino acid is synonymous (S); one that does is non-synonymous (NS).
Lab variant call pipeline


FASTQ file mapped to reference genome -------> SAM/BAM file -------> recalibrated BAM file -------> mpileup file --varscan--> consensus file (a VCF file) -------> SNP file

A BAM file needs to be indexed by SAMtools. BAM files can be viewed in IGV.
The consensus file, when filtered, generates the SNP file.
If the variant-supporting read count is greater than or equal to 3, consider it a SNP (commands used: awk, wc -l); a sketch follows below.
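A minimal sketch of that filter, assuming a VarScan-style table where column 6 (read2, per the column list below) holds the variant-supporting read count:
#skip the header line, keep rows with >= 3 variant-supporting reads
awk -F"\t" 'NR > 1 && $6 >= 3' consensus_file > snp_file
#number of SNPs retained
wc -l snp_file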

GATK (from Broad institute), Samtools and varscan for variant calling

Some genomic regions are difficult to sequence, so coverage depth is poor there
Coverage is generated by BamTools.
VCF: Variant Call Format.
Tools to manipulate VCF files: SAMtools, BEDTools, VCFtools, GATK tools.
The VCF file here has 19 columns:
$1: chrom
$2: pos
$3: ref
$4: mut
$5: read1
$6: read2
$19: var allele
Genes on the minus strand need to be reverse-complemented; a one-line sketch follows.
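A one-line shell sketch of reverse complementing a sequence (the example sequence is made up):
#reverse the string, then swap complementary bases
echo "ATGCCGTA" | rev | tr 'ACGTacgt' 'TGCAtgca'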

False Positive SNP: called by the NGS platform but not by the gold standard (Sanger)
False Negative SNP: called by the gold standard (Sanger) but not by the NGS platform
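A minimal sketch of tallying these, assuming sorted one-position-per-line files ngs_positions.txt and sanger_positions.txt (both hypothetical):
#positions only in the NGS calls = false positives
comm -23 ngs_positions.txt sanger_positions.txt | wc -l
#positions only in the Sanger calls = false negatives
comm -13 ngs_positions.txt sanger_positions.txt | wc -l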

A SNP analysis pipeline..........
step 1: Quality score recalibration
java  -jar file.jar  -R file.fasta  -I file.bam  -O output_file

step 2: Covariates
java  -jar file.jar  -R file.fasta  -Known_SNPs  -I file.bam   -T CountCovariates -recalFile recalibrated_file

step 3: Known_SNPs file generation (Use of MUMmer for rapid alignment of entire genome)


step 4: mpileup file generation
samtools mpileup -BI  -f path_to_ref_fasta  recalibrated.bam  > file.mpileup

step 5: SNP and consensus calling
java -jar  VarScan.jar  pileup2snp  file.mpileup  --min-coverage 2  --min-var-freq 0.55 --p-value 0.10  > snp_file

java -jar  VarScan.jar  pileup2cns  file.mpileup  --min-coverage 2  --min-var-freq 0.55 --p-value 0.10  >consensus_file

step 6: Uploading SNP data to SQL database
LOAD DATA LOCAL INFILE 'path_to_file'
INTO TABLE TABLE_NAME (ID, pos, ref, mut,....)

step 7: Generate coverage file
bamtools coverage -in file.bam -out  output_file
awk '{if ($2 >= 1000 && $2 <= 2000 ) {print $3}}' file
awk '{if ($2 >= 1000 && $2 <= 2000) {sum += $3; count += 1}} END {print sum, count, sum/count}' file
########
Indel calculation
IMF in a VCF file is the maximum fraction of reads supporting an indel
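A hedged sketch of pulling IMF out of such a VCF with bcftools (assuming IMF is present in the INFO field, as in samtools/bcftools output):
#report chromosome, position, and IMF for indel records
bcftools query -i 'TYPE="indel"' -f '%CHROM\t%POS\t%INFO/IMF\n' file.vcf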
#Code for using the aligner bbmap
bbmap.sh in1=reads1.fq in2=reads2.fq out=mapped.sam ref=ecoli.fa nodisk
----------------------------------------------
#Check human genome  vcf file with polymorphisms 
#Using tools and R packages
#get bwa
cd /root
wget -O bwa-0.7.10.tar.bz2 http://sourceforge.net/projects/bio-bwa/files/bwa-0.7.10.tar.bz2/download
#untar and compile bwa (via make)
tar xvfj bwa-0.7.10.tar.bz2
cd bwa-0.7.10
make
cp bwa /usr/local/bin
#install some tools
apt-get update
apt-get -y install samtools screen git curl gcc make g++ python-dev unzip \
      default-jre pkg-config libncurses5-dev r-base-core \
      r-cran-gplots python-matplotlib sysstat libcurl4-openssl-dev libxml2-dev
git clone https://github.com/schimar/ngs2014_popGen.git
cd ngs2014_popGen/var_call2/
#index the reference genome
bwa index ref_genome.fna
#map reads to the indexed reference genome
bwa aln ref_genome.fna read_file.fq > mapped_reads.sai
#create the SAM file
bwa samse ref_genome.fna mapped_reads.sai read_file.fq > mapped_reads.sam
#index the reference genome
samtools faidx ref_genome.fna
#convert from SAM to BAM
samtools view -b -S -o mapped_reads.bam mapped_reads.sam
#sort the BAM
samtools sort mapped_reads.bam mapped_reads.sorted
#index again, but now the sorted BAM file
samtools index mapped_reads.sorted.bam
#visualize the alignment
samtools tview mapped_reads.sorted.bam ref_genome.fna
#variant exploration with Bioconductor
R
source("http://bioconductor.org/biocLite.R")
biocLite()
biocLite("VariantAnnotation")
biocLite("SNPlocs.Hsapiens.dbSNP.20101109")
biocLite("BSgenome.Hsapiens.UCSC.hg19_1.3.1000")
#quality control
library(VariantAnnotation)
fl <- system.file("extdata", "chr22.vcf.gz", package="VariantAnnotation")
(hdr <- scanVcfHeader(fl))
info(hdr)[c("VT", "RSQ"),]
(vcf <- readVcf(fl, "hg19"))
head(rowData(vcf), 3)
rowData(vcf) <- renameSeqlevels(rowData(vcf), c("22"="ch22"))
#load the SNP database and discover whether our SNPs are in dbSNP
library(SNPlocs.Hsapiens.dbSNP.20101109)
destination <- tempfile()
pre <- FilterRules(list(isLowCoverageExomeSnp = function(x) {
grepl("LOWCOV,EXOME", x, fixed=TRUE)
}))
filt <- FilterRules(list(isSNP = function(x) info(x)$VT == "SNP"))
snpFilt <- filterVcf(fl, "hg19", destination, prefilters=pre, filters= filt)
vcf_filt <- readVcf(snpFilt, "hg19")
rowData(vcf)
rowData(vcf_filt)
#compare new snps with database
inDbSNP <- rownames(vcf) %in% rownames(vcf_filt)
table(inDbSNP)
metrics <- data.frame(inDbSNP = inDbSNP, RSQ = info(vcf)$RSQ)
#visualize
library(ggplot2)
ggplot(metrics, aes(RSQ, fill=inDbSNP)) +
  geom_density(alpha=0.5) +
  scale_x_continuous(name="MaCH / Thunder Imputation Quality") +
  scale_y_continuous(name="Density") +
  theme(legend.position="top")
-------------------------------------------------------------------------------
#To download human genome VCF files:
http://uswest.ensembl.org/info/data/ftp/index.html
