Saturday, December 12, 2015

Shell (2): awk (1) .......

Version information of awk
awk --version
------------------------
PRINT
#Prints current file name
awk 'END {print FILENAME}' file
#Prints entire content of the file
awk '{ print }' file
awk -F "\t" '{ print $0 }' file
# Print all lines of a file
 awk '{print;}' file
#Print lines with a given pattern
 awk '/pattern/' file
#Print lines without a given pattern
 awk '!/pattern/' file
#Prints the whole table with legends of each column
awk 'BEGIN{printf "No.\tName\tCity\tAge\n"} {print}' file
 #print CONDITION ? PRINT_IF_TRUE_TEXT : PRINT_IF_FALSE_TEXT (to find length of sequence in column 2)
 awk '{print (length($2)>12) ? $0">12" : $0"<=12";}' file
 #Print $1 three times (using printf)
 awk '{printf $1"\t"}{printf $1"\t"}{print $1}' file
 #Print $1 two times (using printf); also make line 3 as null
 awk 'NR==3{print "";next}{printf $1"\t"}{print $1}' file
#Extract lines if column 5 is 'H' (before that print the file name)
awk '{
  if (FNR == 1 ) print ">" FILENAME
  if ($5 == "H") {
    printf $4
  }
}
END { printf "\n"}'  file
#Complete content of the file is printed (script.awk has {print}). The awk script file is executed with -f.
awk -f  script.awk file
PATTERN 
#Names with pattern 'a'. Its case-sensitive.
awk '/a/ {print $0}' file
#Print the lines matching the pattern
awk '/pattern/' file
#Find all line where field 2 starts with the pattern br
awk '$2 ~ /^br/' file
#Extract lines that starts with M, T or null (or the $1 is M, T or null)
awk '{if($0 ~ /^[MT ]/ ) print;}' file
#Print the lines that don't match the pattern
awk '!/pattern/' file
#Count and print matched pattern (pattern here is 'a')
awk '/a/{++cnt} END {print "Counts of pattern match = ", cnt}' file
Counts of pattern match =  3
#Prints line with either 1 or both patterns
awk '/tyu|uju/' file
 # Print any line where field #4 is equal to "pattern"
 awk '$4 == "pattern"'
 # Print only those lines where field #4 is not equal to "pattern"
 awk '$4 != "pattern"'
 awk '!($4 == "pattern")'
#Find pattern common to file1 ($1) and file2 ($1) and print $2 of file1 by the side of file2 content
awk '
BEGIN { FS=OFS="\t" }
NR==FNR { map[$1] = $2; next }
{
    for (key in map)
        if ($0 ~ key)
            $0 = $0 OFS map[key]
    print
}
' file1 file2
 # Matching a field against a regular expression
 awk '$6  ~ /^[a-d]/'
 awk '$6 !~ /^[a-d]/'  
 # Print the line immediately before a regex, but not the line containing the regex
 awk '/regex/{print x};{x=$0}'
 awk '/regex/{print (NR==1 ? "match on line 1" : x)};{x=$0}'
 # Print the line immediately after a regex, but not the line containing the regex
 awk '/regex/{getline;print}'
#Pulls out lines between start_pattern and end_pattern
awk '/start_pattern/ {flag=1;next} /end_pattern/{flag=0} flag {print}' file
awk '/end_pattern/{flag=0}flag;/start_pattern/{flag=1}' file
awk '/start_pattern/{flag=1;next}/end_pattern/{flag=0}flag'  file
sed -e '1,/start_pattern/d' -e '/end_pattern/,$d' file
#Pulls out lines between start_pattern and end_pattern, also the lines with these patterns
awk '/start_pattern/ { show=1 } show; /end_pattern/ { show=0 }' file
sed -n '/start_pattern/,/end_pattern/p' file
sed '/start_pattern/,/end_pattern/!d' file
LENGTH 
#Print lines having more than 20 characters (all were shown as all have above 20 characters)
awk 'length($0) > 20' file
#Print the number of arguments passed
awk 'BEGIN {print "Arguments =", ARGC}' rain winter autumn
Arguments = 4  
#Print the number of arguments passed, with their numbers 
awk 'BEGIN{for (i = 0; i < ARGC; ++i){printf "ARGV[%d] = %s\n", i, ARGV[i]}}' rain winter autumn
ARGV[0] = awk
ARGV[1] = rain
ARGV[2] = winter
ARGV[3] = autumn
#Find the length of a string 
echo "seema" | awk '{print length}'
#Prints each line and their string length
awk '{ print $0 " = " length($0) }' file
#Prints only lengths (no strings)
awk '{ print length($0) }' file
COLUMNS/FIELDS
#Print all columns
awk '{print $0}' file
#To extract column1 from a multi-column file
awk '{print $1}' file
#To extract column1 from a multi-column file and paste it to another file with column
awk '{print $1}' < file1 | paste file2 - > merged_file
#To print only column 1
awk '{print $1}' multi_column_file > uni_column_file
#Print the 3rd column
awk '{print $3}' file
#print fields 1 and 3
awk '{print $1 $3}' file
#print only columns 3 and 4 when column 3 contains "pattern"
awk '$3=="pattern"{ print $3, $4}' file
#print the line when column 3 contains "pattern"
awk '$3=="pattern"' file
 # Print the last column of each line
 awk '{ print $NF }'
#Print the 1st and the 3rd columns
awk '{print $1 $3}' file
#To print all column except column 1
awk '{for (i=2; i<NF; i++) printf $i " "; print $NF}' file
#To extract field 3 (the columns must be comma separated)
awk -F, '{$0=$3}1' file
#To extract column 2 from comma separated file
awk -F "," '{print $2}' file
#To extract field 3 (the columns must be colon separated)
awk -F ":" '{print $3}' file
#To extract field 3 (the columns must be space separated) 
awk -F " " '{print $3}' file
#To extract all columns except 3
awk '{$3=""; print $0}' file
#To extract  all other columns but not the 1st and the 2nd.
awk '{$1= $2= ""; print $0}' file
#To extract penultimate and last columns (very important)
awk '{print $(NF-1)" "$NF}' file
awk '{print $NF}' file  #extract last column
awk '{print $(NF-1)}' file         #the 2nd last column
awk ' {print ($(NF-1)) }' file
awk '{print $1, $2, $3, $4, $NF}' file # extract 1,2,3,4 and last field
#Adding a comma between the columns can give the required space between them
seq 10 | xargs -n5 | awk '{NF--; print $NF}'   #create a seq of 10 numbers, make groups of 5, extract 2nd last column
#To remove last 3 columns (from csv file, with comma separator)
awk -F , -v OFS=, '{ for (i=1;i<=NF-3;i++){ printf "%s%s", $i, (i==NF-3 ? "" : OFS) }; printf "\n"}' file
#To print column 1,  and 3 in tabs for the line with the given pattern (suppose 'qwe' is the pattern)
awk '/qwe/ {print $1,"\t", $2, "\t", $3;}' file
#Print all but very first  or the first 2 column
cat somefile | awk '{$1=""; print $0}'
cat somefile | awk '{$1=$2=""; print $0}'
#Extract field 3 and field 4, with space in between
awk '{ print $3 " " $4 }' file
#Print all but two last columns
awk '{$(NF-1)=$NF=""; print $0}' file
#To print all fields for lines with the pattern
awk '/qwe/ {print $0;}' file
#To print column in the given order for the lines with the pattern
awk '/qwe/ {print $3,"\t", $2, "\t", $1 ;}' file
#For the first line with pattern, print column 1 and 3; for rest of the patterned line all columns
awk '/qwe/ {FS="\t"; print $1,"\t",$3;}' file
#Delete a specific column or multiple columns
awk '!($3="")' file   #delete 3rd column
awk '{ $3=$4=$5=$6=""; print $0 }' file   #delete 3rd, 4th, 5th and 6th column
#Print column in any order
awk '/n/ {print $4 "\t" $3}' file
#Print only 3rd and 4th field
awk '{print $3 "\t" $4}' file
#Extract field 3 and 4, where names have pattern 't'. Its case-sensitive.
awk '/t/ {print $3 "\t" $4}' file
 # Print the line with the largest first field
 awk '$1 > max {max=$1; maxline=$0}; END{ print max, maxline}'
 # Print the number of fields in each line, followed by the line
 awk '{ print NF ":" $0 } '
#For the first line with pattern, print column 1 and 4, with comma between, for rest of the patterned line all columns
awk '/qwe/ {FS="\t"; OFS=","; print $1,$4;}' file
#Print the last field of each line of the file
awk '{ print $NF }' file
#Extracts columns those satisfy the conditions
awk '{if ($2 >=0 && $2 <=2) {print $2,$3}}' file
#Print a range of columns from the 2nd till the 4th
awk -v f=2 -v t=4 '{for(i=f;i<=t;i++) printf("%s%s",$i,(i==t)?"\n":OFS)}' file
#Exclude a column range from the 2nd till the 4th and print the rest of the columns
awk -v f=2 -v t=4 '{for(i=1;i<=NF;i++)if(i>=f&&i<=t)continue;else printf("%s%s",$i,(i!=NF)?OFS:ORS)}' file
#If the 2nd field starts with a number, print field1
awk '$2 ~ /^[0-9]/ { print $1 }' file
 #Read the line with fields separated by colon. Then extract field 2
echo "154:266:377:454:533" | awk -F: '{print $2}'
#This code will take column1, sort it, find unique positions and count frequency
awk -F '\t' '{print $1}' file | sort | uniq -c | sort -nr
#Prints field 1 when fields are separated by comma (,). If not separated by , sign all fields are printed
awk 'BEGIN { FS = "," } ; { print $1 }' file
#Print column2, pipe it to another awk to eliminate patterns not required, output the result to a new file
awk '{print $2}' path_to_file | awk '!/hypothetical/' > new_file
#Prints where fields are separated by colons
awk -F':' '{print $3,$4;}' file
#Print columns from the 2nd till the 4th (column should be ab separated)
awk -v f=2 -v t=4 '{for(i=f;i<=t;i++) printf("%s%s",$i,(i==t)?"\n":OFS)}' file
#Ignore columns 2nd to 4th, print rest of them
awk -v f=2 -v t=4 '{for(i=1;i<=NF;i++)if(i>=f&&i<=t)continue;else printf("%s%s",$i,(i!=NF)?OFS:ORS)}' file
 #Print columns  separated by comma
 awk '{print $1","$2;}' file
 awk '{print $1","$NF;}' file
 #Print header and sequence as legend; the columns separated by comma; the ending line
 awk 'BEGIN{print "Header,Sequence"}{print $1","$2;}END{print "****"}' file
#print columns separated by tab (change output field separator (OFS) to tabular "\t")

awk -F"\t" '$3=="pattern"{OFS = "\t"; print $3, $4}' file
#Count number of instances in field 1
awk '{h[$1]++}; END { for(k in h) print k, h[k] }' file
 # Print the first 2 fields, in opposite order, of every line
 awk '{print $2, $1}' file
 # Switch the first 2 fields of every line
 awk '{temp = $1; $1 = $2; $2 = temp}' file
#switch column 3 and 4
awk '{tmp=$3; $3=$4; $4=tmp}; {print }'  file 
#switch only in lines where field 3 is larger than field 4
awk '$3>$4{tmp = $3; $3=$4; $4=tmp;}; {print }'  file
#print columns in any specific order
awk '{ print $2, $1, $4, $3 }' file
#AND (&&) and OR (||) operator
awk '$3=="pattern1" && $4>20 { print $3, $4}' file
awk '$3=="pattern1" || $3=="pattern1" { print $3, $4}' file
ROWS/LINES
#Prints all entries in file
awk '{f=1} f; /pattern/{f=0}' file
#Extract row (record) 1
awk 'NR == 1' file
#Print the line 50
awk 'NR==50 {print;exit}' file
#Print the lines 20 through 35
awk 'NR==20,NR==35' file
#Print the first 3 lines of a file
awk 'NR <= 3' file
#Print the last line of a file
awk 'END{print}' file
#To extract row 5 from files with starting pattern 'isolates' (using a for loop)
for file in /home/seema/data_analysis/isolates*
do
      awk 'NR == 5' $file
done > output_file
#Extracts only unique rows of a file
awk '!NF || !seen[$0]++'  file
 # Print first 10 lines of file
 awk 'NR < 11'
#To extract rows from 10000 to 100000
awk 'NR > 100000 { exit } NR >= 10000 && NR <= 100000' file
 # Print first line of file
 awk 'NR>1{exit};1'
#Prints only row 1 and exits
awk '{ OFS="::" ; print $1, $2 ; exit }' file
#Prints all rows
awk '{ OFS="::" ; print $1, $2}' file
#Print the next line after the given pattern
awk 'f{print;f=0} /pattern/{f=1}' file
awk 'c&&!--c; /pattern/{c=1}' file
#Print all the lines after that pattern
awk 'f;/pattern/{f=1}' file
#Print row with the pattern and all lines after it
awk '/pattern/{f=1}f' file
#Extracts everything from given pattern1 to given pattern2, including them
awk '/pattern1/{f=1} f; /pattern2/{f=0}' file
#Extracts everything from given pattern1 to given pattern2, but not including the pattern1
awk '/pattern1/{f=1;next} f; /pattern2/{f=0}' file
#Extracts everything from given pattern1 to given pattern2, but not including the pattern2
awk '/pattern1/{f=1} /pattern2/{f=0} f' file
#Extracts everything from given pattern1 to given pattern2, but not including any of them
awk '/pattern2/{f=0} f; /pattern1/{f=1}' file
#Print all lines in the file
awk '{print $0}' file
#Find row with this pattern 'gh'
awk '/gh/'  file
#Find and print all the lines in a file, that match multiple patterns grep -E 'pattern1.*pattern2' file.txt # in that order
awk '/tyu.*uju/' file # in that order
awk '/tyu/ && /uju/' file # in any order
#Find and print all the lines, that do not match a pattern
awk '!/tyu/' file
# Print the total number of lines that contain a pattern 
 awk '/pattern/{n++}; END {print n+0}' file
# Print every line with more than 4 fields
 awk 'NF > 4'
# Print every line where the value of the last field is > 4
 awk '$NF > 4'
 # Print the last line of a file
 awk 'END{print}'
SUBSTITUTE (Replace, change, swap)
#Substitute the prefix pattern with null in row 4th
awk 'NR==4{sub(/^pattern/,"")}{print}' file
#Swaps the specified 2 fields
awk '{tmp = $1; $1 = $2; $2 = tmp; print}' file
#Find/replace dog, cat or bird with pet  and print
awk '{gsub(/dog|cat|bird/,"pet");print}' file
#Find/replace dog with cat in every file
awk '{gsub("dog", "cat", $0); print > FILENAME}' file
DELETE/REMOVE
#Deletes all blank lines in the file
awk 'NF'  file
awk '/./' file
#Deletes 2nd row  in the file
awk 'NR!=2'
#Delete the trailing whitespace  from end of each line
awk '{sub(/[ \t]+$/, "");print}' file
# Removing lines containing a unique first field (keep only lines with duplicate first fields)
awk 'FNR==NR{a[$1]++;next}(a[$1] > 1)' ./file ./file

# Removes all lines where field 2 has same value
awk -F, '
!seen[$2]++ {
    line[$2] = $0
}
END {
    for(val in seen)
        if(seen[val]==1)
          print line[val]

}' file
#Delete leading and trailing whitespace from each line
awk '{gsub(/^[ \t]+|[ \t]+$/,"");print}' file
#Remove the 3rd field in each line and then print it
awk '{ $3 = ""; print }' file
#Delete the leading whitespace (spaces or tabs) from front
awk '{sub(/^[ \t]+/, ""); print}' file
#Delete 5 lines after a pattern (including the line with the pattern)
sed -e '/pattern/,+5d' file
awk '/pattern/ {i=5; next} {if (i>0) i--; else print}' file
#Remove numbers from specific columns but not text
 awk '{gsub("[0-9]","",$2);gsub("[0-9]","",$3)}1' file
#Remove duplicate, consecutive lines (emulates "uniq")
awk 'a !~ $0{print}; {a=$0}' file
#Deletes leading spaces of a file
awk '{$1=$1}1' file
# Remove duplicate, consecutive lines (emulates "uniq")
 awk 'a !~ $0; {a=$0}'
# Remove duplicate, nonconsecutive lines
 awk '!a[$0]++'                     # most concise script
 awk '!($0 in a){a[$0];print}'      # most efficient script
#Remove last characters of all fields
awk '{gsub(/\./,"");print $0}' file
#Remove last characters of field 5
awk '{gsub(/\./,"");print $5}' file
#Make everything after symbol ';' into null 
cat file | awk '{sub(/;.*/,""); print}'
cut -d\; -f1 file
INSERTING (add)
#Add characters at the beginning and at the end of each line 
awk '{print "s"$0"p"}' file
#Add commas and print in one line (no comma, comma)
echo $(awk 'NR > 1{print line"  "}{line=$0;}END{print $0" "}' file)
echo $(awk 'NR > 1{print line", "}{line=$0;}END{print $0" "}' file)
#Add characters at the beginning of each line 
awk '{print "seema"$0}' file
#Add characters at the end of each line 
awk '{print $0"seema"}' file
#Add a semicolon to the end of each line
awk '{print $0";"}' file
awk '{print $0"!"}' file
awk '{print $0"."}' file
#Insert 5-6 blank spaces at beginning of each line
 awk '{sub(/^/, "     ")};1' file
awk '{sub(/^/, "      ");print}' file
#Add some characters at the beginning of matching lines.
awk '/pattern_that_matches/{sub(/^/, "word_to_add "); print;next;}{print}' file
 #Insert 20 spaces after column #3 of each input line
 gawk --re-interval 'BEGIN{while(a++<20)s=s " "};{sub(/^.{3}/,"&" s)};1'
#adds text and print
awk '{print "Location: "$3"\t Name: "$1" "$2}' file
awk '{printf "Location:%s\tName: %s %s \n", $3, $1, $2}' file

#checks condition, adds text and prints
awk '{if ( $3 == "pattern")  print "Location: "$3"\tName: "$1" "$2}' file
awk ' $3 == "pattern" {print "Location: "$3"\t Name: "$1" "$2}' file

NUMBERS (calculate, sum)
# Print line numbers using a tab instead of a space
awk '{print FNR "\t" $0}' file
 #Numbers the lines in some file
awk '{ print NR ". " $0 }' file
#Count of unique lines based on first field in file
awk '!h[$1] { g[$1]=$0 } { h[$1]++ } END { for(k in g) print h[k], g[k] }'
#Count the lines in a file. Just like "wc ­-l".
awk 'END{print NR}' file
#Total the number of lines that contain the word
awk '/pattern/{n++}; END {print n+0}' file
#Calculates sum of one column (here, column 2 of the file)
awk '{sum+=$2} END {print sum}' file
#Add up the numbers of 2nd field, print the total (column should be tab-separated)
awk '{ sum += $2 } END { print sum }' file
#Print the total number of lines that have the name pattern
awk '/pattern/{n++}; END {print n+0}' file
#Print only lines of less than 20 characters (it makes roughly 3 words)
 awk 'length < 20' file
#Precede each line with number for the line
awk '{print FNR "\t" $0}' file
#Count lines (emulates "wc -l")
 awk 'END{print NR}' file
 #Print the sums of the fields of every line
 awk '{s=0; for (i=1; i<=NF; i++) s=s+$i; print s}' file
#Space taken by the JPG files (field 5 has the values)
ls -l *.jpg | awk '{total+=$5} END {print "JPG files in total: "total}'
#find smallest number in last column (approach it by printing fields in reverse order) (here filed 2)
#Once you reverse the order, last field becomes first, which can be easily manipulated
awk '{for (i=NF;i>0;i--){printf $i" "};printf "\n"}' file > file1
awk 'NR == 1 {line = $0; min = $2} NR > 1 && $2 < min {line = $0; min = $2} END{print line}' file1 > file2
#Age calculation ($4 is age of each entry)
awk '{ tot_age = tot_age + $4 }{ aver=tot_age/NR}{ print $0 " cumulative age sum:" tot_age " average:" aver}' file
 #Print sum of rows
 awk '
BEGIN {FS="\t"}
 {
        sum=0; n=0
        for(i=2; i<=NF;i++)
                {sum+=$i; ++n}
                 print $0"\t"sum
                 # (if you want to print number of rows or average )
                 # (print $0"\t""sum:"sum"\t""count:"n""\t""avg:"sum/n)
 }' $1
 Use: sh script.sh file
# Create a string of a specific length (e.g., generate 200 spaces)

 awk 'BEGIN{while (a++<200) s=s " "; print s}'
SPACING
#Double space a file (two ways)
awk '1;{print ""}' file
awk 'BEGIN{ORS="\n\n"};1' file
#Double space a file which already has blank lines in it.
awk 'NF{print $0 "\n"}' file
#Triple space a file
awk '1;{print "\n"}' file
REVERSE
# Print in reverse order the fields of every line
 awk '{for (i=NF; i>0; i--) printf("%s ",$i);print ""}' file
MATH
#print lines in which the ratio between columns 2 and 5 is smaller than 0.5
awk '$2/$5<0.5' file
#To print the integers
awk 'BEGIN{
print int(3.534);
print int(4);
print int(-5.223);
print int(-5);
}'
#To print square roots
awk 'BEGIN{
print sqrt(16);
print sqrt(0);
print sqrt(-12);
}'
#To print the log values
awk 'BEGIN{
print log(12);
print log(0);
print log(1);
print log(-1);
}'
#To print the trigonometrical values
awk 'BEGIN {
print sin(90);
print sin(45);
}'
#Random number generation (100 random numbers between 0 to 10 and their frequency )
awk 'BEGIN {
while(i<100)
{
n = int(rand()*10);
rnd[n]++;
i++;
}
for(i=0;i<=10;i++) {
print i,"Occured", rnd[i], "times";
}
}'
#Count number of columns in each line

awk -F "\t" '{ print NF }'  file
----------------------------------------------------Special jobs----------------------------- 
#This code finds the smallest and largest number in column 2 (the columns should be tab-separated)
awk '
function max(x){i=0;for(val in x){if(i<=x[val]){i=x[val];}}return i;}
function min(x){i=max(x);for(val in x){if(i>x[val]){i=x[val];}}return i;}
{a[$2]=$2;next}

END{minimum=min(a);maximum=max(a);print "Maximum = "maximum " and Minimum = "minimum}' file

#Find a program by name from process listing that is not awk and kill it
ps aux | awk '/program_name/ && !/awk/ {print $2}' > kill
#Change to working dir, extract field 1 and  field 2 as files, add the pattern (pos_ here) before each line of the new file
cd denovo_analysis/
awk '{print $1}' new_file
awk '{print $2}' new_file
awk '{print $2}' new_file > 2nd_col
sed "s/^/pos_/" 2nd_col
 #To extract the lines common to all files (very important)
(If the formatting of this code is not working, tweak and check the spaces and modify.)
awk 'END {
  for (R in rec) {
    n = split(rec[R], t, "/")
    if (n > 1)
      dup[n] = dup[n] ? dup[n] RS sprintf("\t%-20s -->\t%s", rec[R], R) : \
        sprintf("\t%-20s -->\t%s", rec[R], R)
    }
  for (D in dup) {
    printf "records found in %d files:\n\n", D
    printf "%s\n\n", dup[D]
    }
  }
{
  rec[$0] = rec[$0] ? rec[$0] "/" FILENAME : FILENAME
  }'  file*

 #To extract  lines  only in file 1
 awk 'FNR == NR { h[$1,$2]; next }; !($1 SUBSEP $2 in h)'  file2 file1

#Join files
 FNR == NR {
    if ( FNR == 1 ) { 
        header = $2
        next
    }   
    hash[ $1 ] = $2
    next
}
FNR < NR {
    if ( $1 in hash || FNR == 1 ) { 
        printf "%s %s\n", $0, ( FNR == 1 ? header : hash[ $1 ] ) 
    }   
}
awk -f script.awk input1 input2 | column -t
 #Convert every second row into second column
 awk '{ if (NR % 2 == 1) tmp=$0; else print tmp, $0; }' file
#Sum values based on duplicate keys
 awk '
  {
    A[$1]++
    B[$1]+=$2
   }
  END{
    for(i in A) print i, B[i]
  }
' OFS='\t' $1

 Use: awk -f script.awk input
----------------
VARIABLES 
#Value assignment to variable by -v flag.
awk -v name=Carl 'BEGIN{printf "Name = %s\n", name}'
Name = Carl
#Prints list of global variables in a file awkvars.out.
awk --dump-variables ''
awk --help
#It checks dubious constructs
awk --lint '' /bin/ls
File named awkprof.out is formed with the code block written in formatted form.

awk --profile 'BEGIN{printf"---|Header|--\n"} {print} END{printf"---|Footer|---\n"}' file > /dev/null
awk 'BEGIN {print "FS = " FS}' | cat -vte
FS =  $
echo -e "ab bc\nab bc cd \nab bc cd de" | awk 'NF > 2'
ab bc cd                                                                       
ab bc cd de  
echo -e "ab bc\nab bc cd\nab bc cd de" | awk 'NR < 3'
ab bc                                                                          
ab bc cd  
awk 'BEGIN {print "OFS = " OFS}' | cat -vte
OFS =  $

awk 'BEGIN {print "ORS = " ORS}' | cat -vte
ORS = $                                                                        

$
Match (string, regexp). RLENGTH is for length of match.
awk 'BEGIN { if (match("rain snow clouds river tree", "ver")) { print RLENGTH } }'
3
RSTART finds the first position of matching. e.g. 1 here (r of ros).
awk 'BEGIN { if (match("rose jasmine hibiscus", "ros")) { print RSTART } }'

1
 #Print the usernames of all users on your system
awk -F: '{print $1}' /etc/passwd
 # Print and sort the login names of all users
 awk -F ":" '{print $1 | "sort" }' /etc/passwd
#find difference of the exon starts and ends
mysql -N -h  genome-mysql.cse.ucsc.edu -A -u genome -D hg18 -e 'select name,name2,exonStarts,exonEnds from ensGene' |\
awk -F '    ' '{n=split($3,a1,"[,]"); split($4,a2,"[,]"); size=0; for(i=1;i<=n;++i) {size+=int(a2[i]-a1[i]);} printf("%s\t%s\t%d\n",$1,$2,size); }'
#Using strings from several input files as search criteria for select columns in a CSV file
awk -v inc="(^|,)($(tr '\n' '|' <inclusion))(,|$)" -v exc="(^|,)($(tr '\n' '|' <exclusion))(,|$)" 'NR==1 || ($0 ~ inc && ! ($0 ~ exc))' file
# To assign values to each character and to find average
#awk -f scipt.awk file
BEGIN {
    FS=""
    k["A"]=0.2; k["G"]=0.5; k["L"]=0.14; k["M"]=0.70
    k["R"]=0.55; k["C"]=0.48; k["H"]=1.00; k["K"]=0.4
}

/^>/{
    $1=""
    name=$0
    next
}

{
    s=0
    for (i=1; i<=NF; i++) {
      s+=k[$(i)]
    }
    printf "%s - %.3f\n", name, s/NF
}
#Extract the number of specific characters from a text file (The alphabet A and E here).
awk '
    $1 ~ /^>/ {
        getline str
        num_a = gsub( /A/, "", str )
        num_e = gsub( /E/, "", str )
        printf "%s\nTotal number of A - %d\nTotal number of E - %d\n\n", $0, num_a, num_e
    }
' file
#Calculate the total number of specific residues in protein sequences
sed 's/>/\n/' file |
    awk -vRS='' -vOFS='\t' '
        BEGIN{
            print "ptn","A+L"
        }
        {
            gsub(/[^AL]/,"",$2)
            print $1,length($2)
        }'
#extracting segments from a file
#file1 has sequence and file2 has positions (e.g. 5 10)
awk 'BEGIN{getline sequence < "first_file"} {print substr(sequence, $1, $2 - $1 + 1) }' second_file
#multiply all numbers in a text file by a constant subject to another constraint
#Here multiplied with 1.2 (20% rise). The max value should not exceed 1
awk '{p=1.2*$0;$0=p>1?1:p;printf "%.5f\n",$0}' file
#remove a character at the end of each line
awk '{print substr($0, 1, length($0)-1)}' file
awk '{gsub(/,$/,""); print}' file
#Print values less than specified number for all fields (start from column 2 and print; if value is more than 1, go to next line)
awk '{for(i=2;i<=NF;i++)if($i>1){next}}1' file

No comments:

Post a Comment