Tuesday, May 3, 2016

MY SCRIPT (2): Finding unique genes, analyzing them, and a wrapper script

#!/bin/sh
#Code to find out unique genes
#Run as: sh unique_genes_finding.sh 2>&1 | tee all_isolate_gene_profile
#
# For every strain in $strain_list and every isolate in $isolate_list this
# script derives the isolate's protein-name lists from <isolate>.only_header
# and uses comm(1) to report the proteins shared with the strain, found only
# in the strain, and found only in the isolate.  Each comparison is written
# to $result_dir, copied into $unique_dir, and echoed (count first, then the
# lines) to stdout.
#
# The echoed marker lines ("Proteins unique to", "Unique protein search for")
# are matched by the awk patterns in unique_genes_analysis.sh, so their text
# is kept unchanged.

#find /home/pseema/denovo_analysis/result_files/*.only_header

result_dir=/home/pseema/denovo_analysis/result_files
input_dir=/home/pseema/denovo_analysis/input_files
unique_dir=$result_dir/unique_genes

isolate_list=$input_dir/isolate_list
#isolate_list=$input_dir/IO_isolates
#isolate_list=$input_dir/EAS_isolates
#isolate_list=$input_dir/EAI_isolates
#isolate_list=$input_dir/EAM_isolates

strain_list=$input_dir/strain_list
#strain_list=$input_dir/IO_isolates
#strain_list=$input_dir/EAS_isolates
#strain_list=$input_dir/EAI_isolates
#strain_list=$input_dir/EAM_isolates

# build_protein_lists NAME
# From $result_dir/NAME.only_header create, in $result_dir:
#   NAME.only_protein_name                every line with column 1 blanked
#   NAME.only_functional_proteins         the lines not matching "hypothetical"
#   NAME.only_functional_proteins_sorted  the same, sorted and de-duplicated
build_protein_lists() {
  #Extract all columns except column1
  awk '{$1=""; print $0}' "$result_dir/$1.only_header" \
    > "$result_dir/$1.only_protein_name"
  awk '!/hypothetical/' "$result_dir/$1.only_protein_name" \
    > "$result_dir/$1.only_functional_proteins"
  # comm(1) requires sorted input.
  sort -u "$result_dir/$1.only_functional_proteins" \
    > "$result_dir/$1.only_functional_proteins_sorted"
}

# compare_lists COMM_OPTION STRAIN ISOLATE OUT_NAME
# Run comm with COMM_OPTION on the sorted lists of STRAIN (file 1) and
# ISOLATE (file 2), store the result as $result_dir/OUT_NAME, print its line
# count followed by its content, and copy it into $unique_dir.
compare_lists() {
  comm "$1" \
    "$result_dir/$2.only_functional_proteins_sorted" \
    "$result_dir/$3.only_functional_proteins_sorted" \
    > "$result_dir/$4"
  wc -l < "$result_dir/$4"
  cat "$result_dir/$4"
  # Trailing slash: fail loudly instead of creating a plain file named
  # "unique_genes" when the directory is missing.
  cp "$result_dir/$4" "$unique_dir/"
}

# main
# Drive the strain x isolate comparison described at the top of the file.
main() {
  # The copies below need a real directory to land in.
  mkdir -p "$unique_dir"

  # "|| [ -n ... ]" keeps a last line that has no trailing newline.
  while read -r strain || [ -n "$strain" ]; do
    [ -n "$strain" ] || continue   # skip blank lines in the list

    # The strain's sorted list must exist before the first comparison; the
    # isolate loop only produces it once it reaches the strain itself, which
    # is too late for isolates listed earlier (or never, if the strain is
    # not in the isolate list).
    if [ -r "$result_dir/$strain.only_header" ]; then
      build_protein_lists "$strain"
    fi

    while read -r isolate || [ -n "$isolate" ]; do
      [ -n "$isolate" ] || continue

      echo "#################Starting $isolate..####################"
      build_protein_lists "$isolate"
      echo "****Total number of proteins in $isolate: ******"
      wc -l < "$result_dir/$isolate.only_protein_name"
      echo "******Number of non-hypothetical proteins in $isolate: *****"
      wc -l < "$result_dir/$isolate.only_functional_proteins"

      #Shows common proteins to file 1 and file2 (option -12 or -21 can be used to achieve it)
      echo "**Proteins common to $strain and $isolate: **"
      compare_lists -12 "$strain" "$isolate" "in_both.$strain.$isolate"
      echo "**Proteins common to $strain and $isolate done**"

      #These proteins occur only in $strain (only column1)
      # NOTE(review): the output name carries only the isolate, so with more
      # than one strain in the list a later strain overwrites this file.
      echo "**Proteins unique to $strain: **"
      compare_lists -23 "$strain" "$isolate" "not_in.$isolate"
      echo "Unique protein search for $strain done"

      #These proteins occur only in $isolate (only column2)
      echo "**Proteins unique to $isolate: **"
      compare_lists -13 "$strain" "$isolate" "only_in.$isolate"
      echo "Unique protein search for $isolate done"

      echo "********$isolate done********"
    done < "$isolate_list"
  done < "$strain_list"
}

main
-----------------------------------------------------
#!/bin/sh
#Code to analyze data for unique genes
#Execute as:  sh unique_genes_analysis.sh 2>&1 | tee all_isolate_gene_analysis
#mkdir /home/pseema/denovo_analysis/result_files/unique_genes

unique_dir=/home/pseema/denovo_analysis/result_files/unique_genes

# build_pool PREFIX RAW UNIQ REDUCED
# Concatenate every $unique_dir/PREFIX.* file into RAW, collapse runs of
# identical adjacent lines into UNIQ, then drop every repeated non-blank line
# (adjacent or not) into REDUCED.  All three names are relative to
# $unique_dir.
build_pool() {
  # A plain glob replaces `cat \`find ...\``: when nothing matches, cat gets
  # the literal pattern and reports an error instead of silently reading
  # stdin, and file names are no longer word-split.
  cat "$unique_dir/$1".* > "$unique_dir/$2"
  uniq "$unique_dir/$2" > "$unique_dir/$3"
  awk '!NF || !seen[$0]++' "$unique_dir/$3" > "$unique_dir/$4"
}

# count_lines NAME
# Print the number of lines in $unique_dir/NAME.
count_lines() {
  wc -l < "$unique_dir/$1"
}

# report_pools
# Build the three pooled lists (in_both.*, not_in.*, only_in.*) and print
# their line counts in the order: raw (skipped for the common pool),
# adjacent-duplicates removed, all duplicates removed.
report_pools() {
  #find *.matches_comm_12 |  wc -l
  build_pool in_both \
    all_isolates_common all_isolates_common_uniq all_isolates_common_reduced
  echo "Common protein pool when the isolates were compared to each other..."
  #count_lines all_isolates_common
  count_lines all_isolates_common_uniq
  echo "Unique proteins in the common protein pool..."
  count_lines all_isolates_common_reduced

  #find *.matches_comm_23 |  wc -l
  build_pool not_in \
    all_isolates_only_column1 all_isolates_only_column1_uniq \
    all_isolates_only_column1_uniq_reduced
  count_lines all_isolates_only_column1
  count_lines all_isolates_only_column1_uniq
  count_lines all_isolates_only_column1_uniq_reduced

  #find *.matches_comm_13 |  wc -l
  build_pool only_in \
    all_isolates_only_column2 all_isolates_only_column2_uniq \
    all_isolates_only_column2_uniq_reduced
  count_lines all_isolates_only_column2
  count_lines all_isolates_only_column2_uniq
  count_lines all_isolates_only_column2_uniq_reduced
}

report_pools

# Turn the run log all_isolate_gene_profile (read from the current directory)
# into side-by-side tables built from its "Proteins unique to" marker lines
# and the line that follows each marker.  Every intermediate and final file
# is written into $unique_dir.
unique_dir=/home/pseema/denovo_analysis/result_files/unique_genes
profile=all_isolate_gene_profile

# join_cols LEFT RIGHT OUT
# Paste two files from $unique_dir side by side, separated by one space.
join_cols() {
  paste -d' ' "$unique_dir/$1" "$unique_dir/$2" > "$unique_dir/$3"
}

# split_rows IN OUT_ODD OUT_EVEN
# Write the odd-numbered lines of IN to OUT_ODD and the even-numbered lines
# to OUT_EVEN (all names relative to $unique_dir).
split_rows() {
  awk 'NR%2==1' "$unique_dir/$1" > "$unique_dir/$2"
  awk 'NR%2==0' "$unique_dir/$1" > "$unique_dir/$3"
}

# Marker lines themselves ...
awk '/Proteins unique to/' "$profile" \
  > "$unique_dir/pattern_files"

# ... and the single line printed right after each marker.
awk 'f{print;f=0} /Proteins unique to/{f=1}' "$profile" \
  > "$unique_dir/next_lines"

# Marker and following line on one row.
join_cols pattern_files next_lines isolate_diff_unique_genes

# Keep only field 4 of each combined row.
awk '{print $4}' "$unique_dir/isolate_diff_unique_genes" \
  > "$unique_dir/isolate_diff_unique_genes_only_isolate"

# Pair consecutive field-4 values: odd row on the left, even row on the right.
split_rows isolate_diff_unique_genes_only_isolate \
  isolate_diff_unique_genes_only_isolate_only_odd \
  isolate_diff_unique_genes_only_isolate_only_even
join_cols isolate_diff_unique_genes_only_isolate_only_odd \
  isolate_diff_unique_genes_only_isolate_only_even \
  merged_columns_isolates

# Pair the following-lines the same way ...
split_rows next_lines only_odd only_even
join_cols only_odd only_even merged_columns

# ... and append, tab-separated, column 1 minus column 2 to every non-empty row.
awk 'NF > 0 { print $0 "\t" ($1 - $2) }' "$unique_dir/merged_columns" \
  > "$unique_dir/diff_columns"

# Final table: paired field-4 values next to the paired values and their difference.
join_cols merged_columns_isolates diff_columns isolate_gene_diff

#Print content between two patterns
# First pass: print every log line that lies after a "Proteins unique to"
# line and before the next "Unique protein search" line (both excluded).
# Second pass, only if the first awk succeeded: print the
# "Unique protein search for" lines themselves.
profile=all_isolate_gene_profile
echo "*****Isolate-specific unique protein*****"
awk '/Proteins unique to/ {flag=1;next} /Unique protein search/{flag=0} flag {print}' "$profile" \
  && awk '/Unique protein search for/' "$profile"

#To find the common genes in all the files
echo "The core genes are......"

# core_genes LIST_FILE RESULT_DIR OUT_FILE
# Write to OUT_FILE every non-blank line that occurs in
# RESULT_DIR/<isolate>.only_functional_proteins for EVERY isolate named in
# LIST_FILE (one name per line), in order of first appearance.
# Returns 1 without touching OUT_FILE when no isolate name could be read.
#
# The previous `for isolate ... done < list` iterated over the script's
# positional parameters, not over the list, and overwrote the output with a
# single isolate's de-duplicated lines on each pass.
core_genes() {
  cg_list=$1
  cg_dir=$2
  cg_out=$3

  # Collect one input path per listed isolate in "$@".
  set --
  while read -r cg_isolate || [ -n "$cg_isolate" ]; do
    [ -n "$cg_isolate" ] || continue   # skip blank lines
    set -- "$@" "$cg_dir/$cg_isolate.only_functional_proteins"
  done < "$cg_list"

  # Without operands awk would read stdin, so stop here.
  if [ "$#" -eq 0 ]; then
    echo "core_genes: no isolates read from $cg_list" >&2
    return 1
  fi

  # total is the number of listed isolates; a line is "core" when it was
  # seen in that many distinct input files.
  awk -v total="$#" '
    FNR == 1 { fidx++ }
    NF && !((fidx, $0) in seen) {
      seen[fidx, $0] = 1
      if (++hits[$0] == 1)
        order[++n] = $0
    }
    END {
      for (i = 1; i <= n; i++)
        if (hits[order[i]] == total)
          print order[i]
    }
  ' "$@" > "$cg_out"
}

core_genes \
  /home/pseema/denovo_analysis/input_files/isolate_list \
  /home/pseema/denovo_analysis/result_files \
  /home/pseema/denovo_analysis/result_files/unique_genes/indispensable_genes

#To find the shared genes in all the files (it checks from folder to folder to find the shared genes)
echo "The shared genes are......"
#To get rid of backup files
#find . -name '*~' -exec rm {} \;

# shared_genes FILE...
# For every line that occurs in more than one of the given files, print it
# under a "records found in N files:" heading together with the "/"-joined
# base names of the files that contain it.
#
# The files are passed to awk as operands rather than through `cat |`, so
# FILENAME is populated; the number of files per record is counted
# separately instead of splitting the joined names on "/", which would
# miscount because paths contain "/"; and the main action has its opening
# brace, without which the program does not parse.
shared_genes() {
  # Without operands awk would read stdin.
  [ "$#" -gt 0 ] || return 1

  awk '
    # Count each record once per file.
    !((FILENAME, $0) in counted) {
      counted[FILENAME, $0] = 1
      name = FILENAME
      sub(/.*\//, "", name)          # show the base name only
      nfiles[$0]++
      if ($0 in rec)
        rec[$0] = rec[$0] "/" name
      else
        rec[$0] = name
    }
    END {
      for (R in rec) {
        n = nfiles[R]
        if (n > 1) {
          line = sprintf("\t%-20s -->\t%s", rec[R], R)
          if (n in dup)
            dup[n] = dup[n] RS line
          else
            dup[n] = line
        }
      }
      for (D in dup) {
        printf "records found in %d files:\n\n", D
        printf "%s\n\n", dup[D]
      }
    }
  ' "$@"
}

shared_genes /home/pseema/denovo_analysis/result_files/*.only_functional_proteins_sorted
  -----------------------------------------
#!/bin/sh
#Wrapper to call all related scripts
# Each script's stdout and stderr are shown on the terminal and saved to a
# log file via tee.  "2>&1 |" is used instead of "|&" because "|&" is a bash
# extension and a syntax error in other sh implementations such as dash.

#Code to find out unique genes
sh unique_genes_finding.sh 2>&1 | tee all_isolate_gene_profile
#sh unique_genes_finding.sh 2>&1 | tee IO_isolate_gene_profile
#sh unique_genes_finding.sh 2>&1 | tee EAS_isolate_gene_profile
#sh unique_genes_finding.sh 2>&1 | tee EAI_isolate_gene_profile
#sh unique_genes_finding.sh 2>&1 | tee EAM_isolate_gene_profile

#Code to analyze data for unique genes
# Run from the same directory: the analysis reads all_isolate_gene_profile
# by relative path.
sh unique_genes_analysis.sh 2>&1 | tee all_isolate_gene_analysis

No comments:

Post a Comment