-
Notifications
You must be signed in to change notification settings - Fork 0
/
calculate-evaluate_ENCODE-complexity-metrics.sh
301 lines (257 loc) · 9.06 KB
/
calculate-evaluate_ENCODE-complexity-metrics.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
#!/bin/bash
# calculate-evaluate_ENCODE-complexity-metrics.sh
# KA
debug=true
# Define functions ===========================================================
#TODO Function to print hep information for the script
function show_help() {
cat << EOM
Usage: ${0} [OPTIONS]
This script runs analyses to calculate, evaluate, and return various ENCODE
library complexity metrics: encodeproject.org/data-standards/terms/#library
Options:
-h, --help Display this help message.
-t, --threads Number of threads to use (default: 1).
-b, --bam Path to the BAM file.
-m, --mode Mode to specify chromosome filter: SC (exclude
"SP_"), SP (only "SP_"), or both (all chromosomes) (default:
SC).
-q, --mapq MAPQ threshold for BAM filtering (default: 1).
Dependencies:
samtools Required for processing BAM files.
sort Involved in generating various alignment tallies.
uniq Involved in generating various alignment tallies.
wc Involved in generating various alignment tallies.
awk Used for filtering BAM file text streams and generating various
alignment tallies.
bc Used for evaluating metric values.
Example:
calculate-evaluate_ENCODE-complexity-metrics.sh \\
--bam path/to/bam \\
--threads 4 \\
--mode SC \\
--mapq 1
EOM
}
# Function to check if program is in PATH; if not, print an error message and
#+ exit with code 1
function check_program_in_path() {
local program_name="$1"
if ! command -v "${program_name}" &> /dev/null; then
error_and_exit "${program_name} is not in PATH. Please install ${program_name} or add it to PATH to continue."
fi
}
# Function to exit with exit code 1, which stops the execution of all code,
#+ and return an error message
function error_and_exit() {
echo "Error: ${1}" >&2
exit 1
}
# Function to calculate metric based on user-supplied conditions
function calculate_metric() {
local threads="${1}"
local mapq="${2}"
local bam="${3}"
local condition="${4}"
local count_condition="${5}"
samtools view -@ "${threads}" -f 2 -q "${mapq}" "${bam}" \
| awk "${condition}" \
| sort \
| uniq -c \
| awk "${count_condition}"
}
# Function to evaluate NRF or PBC1/2 metrics
function evaluate_metric() {
local metric="${1}" # The metric to evaluate (NRF, PBC1, or PBC2)
local metric_value="${2}" # The value of the metric
# Below, use 'case' with 'bc' for floating-point comparisons to categorize
#+ the metric value. 'bc -l' is a calculator that supports floating-point
#+ arithmetic. Each 'case' option uses 'bc' to evaluate if the metric_value
#+ meets a specific condition and echoes the corresponding evaluation
#+ (e.g., "severe", "moderate", "mild", or "none")
if [[ "${metric}" == "NRF" ]]; then
# shellcheck disable=SC2194
# shellcheck disable=SC2254
case 1 in
$(
echo "${metric_value} < 0.5" | bc -l
)) echo "concerning" ;;
$(
echo "${metric_value} >= 0.5 && ${metric_value} < 0.8" | bc -l
)) echo "acceptable" ;;
$(
echo "${metric_value} >= 0.8 && ${metric_value} < 0.9" | bc -l
)) echo "compliant" ;;
$(
echo "${metric_value} >= 0.9" | bc -l
)) echo "ideal" ;;
esac
elif [[ "${metric}" == "PBC1" ]]; then
# For PBC1, different evaluation terms are returned
# shellcheck disable=SC2194
# shellcheck disable=SC2254
case 1 in
$(
echo "${metric_value} < 0.5" | bc -l
)) echo "severe" ;;
$(
echo "${metric_value} >= 0.5 && ${metric_value} < 0.8" | bc -l
)) echo "moderate" ;;
$(
echo "${metric_value} >= 0.8 && ${metric_value} < 0.9" | bc -l
)) echo "mild" ;;
$(
echo "${metric_value} >= 0.9" | bc -l
)) echo "none" ;;
esac
elif [[ "${metric}" == "PBC2" ]]; then
# For PBC2, different thresholds are applied and different evaluation
#+ terms are returned
# shellcheck disable=SC2194
# shellcheck disable=SC2254
case 1 in
$(
echo "${metric_value} < 1" | bc -l
)) echo "severe" ;;
$(
echo "${metric_value} >= 1 && ${metric_value} < 3" | bc -l
)) echo "moderate" ;;
$(
echo "${metric_value} >= 3 && ${metric_value} < 10" | bc -l
)) echo "mild" ;;
$(
echo "${metric_value} >= 10" | bc -l
)) echo "none" ;;
esac
else
error_and_exit "Invalid positional argument: ${metric}. Argument must be 'NRF', 'PBC1', or 'PBC2'."
fi
}
# Something ==================================================================
check_program_in_path "samtools"
if [[ -z "${1}" || "${1}" == "-h" || "${1}" == "--help" ]]; then
show_help
exit 0
fi
if ${debug}; then
# Values for debugging
threads=1
bam="03_bam/bowtie2/bam/in_G1_Hho1_6336.sort-coord.bam"
mode="SC"
mapq=1
echo "
threads=${threads}
bam=${bam}
mode=${mode}
mapq=${mapq}
"
else
# Default values
threads=1
bam=""
mode="SC"
mapq=1
fi
# Parse command line arguments
while [[ "$#" -gt 0 ]]; do
case "${1}" in
-t|--threads) threads="${2}"; shift 2 ;;
-b|--bam) bam="${2}"; shift 2 ;;
-m|--mode) mode="${2}"; shift 2 ;;
-q|--mapq) mapq="${2}"; shift 2 ;;
*) echo "Unknown parameter passed: ${1}"; exit 1 ;;
esac
done
if [[ -z "${bam}" ]]; then
error_and_exit "BAM file path is required. Use -b or --bam to specify it."
fi
if [[ ! -f "${bam}" ]]; then
error_and_exit "Specified BAM file does not exist."
fi
# Check mode and set awk command accordingly
# shellcheck disable=SC2016
case "${mode}" in
SP) awk_cmd='{ if ($3 ~ /^SP_/) print $3, $4 }' ;;
SC) awk_cmd='{ if ($3 !~ /^SP_/) print $3, $4 }' ;;
both) awk_cmd='{ print $3, $4 }' ;;
*) error_and_exit "Invalid mode: ${mode}. Mode must be 'SC', 'SP', or 'both'." ;;
esac
# Determine M_1, the tally of genomic locations where exactly one read maps
#+ uniquely
# shellcheck disable=SC2016
M_1=$(
calculate_metric \
"${threads}" "${mapq}" "${bam}" \
"${awk_cmd}" '$1 == 1 { count++ } END { print count }'
)
# Determine M_2, the tally of genomic locations where exactly two reads map
#+ uniquely
# shellcheck disable=SC2016
M_2=$(
calculate_metric \
"${threads}" "${mapq}" "${bam}" \
"${awk_cmd}" '$1 == 2 { count++ } END { print count }'
)
# Determine M_DISTINCT, the count of the number of distinct genomic locations
#+ to which some read maps uniquely
# shellcheck disable=SC2016
M_distinct=$(
calculate_metric \
"${threads}" "${mapq}" "${bam}" \
"${awk_cmd}" '{ print $0 }' \
| wc -l
)
# Calculate the ENCODE non-redundant fraction (NRF), the number of distinct
#+ alignments (after removing duplicates) divided by the total number of
#+ alignments
total="$(samtools view -@ "${threads}" -c -f 2 -q "${mapq}" "${bam}")"
dup="$(samtools view -@ "${threads}" -c -f 1024 -f 2 -q "${mapq}" "${bam}")"
nondup="$(( total - dup ))"
NRF="$(echo "scale=9; ${nondup} / ${total}" | bc)"
# Calculate the ENCODE PCR bottleknecking coefficient #1 (PBC1):
#+ M_1 / M_DISTINCT
PBC1="$(echo "scale=9; ${M_1} / ${M_distinct}" | bc)"
# Calculate the ENCODE PCR bottleknecking coefficient #2 (PBC2):
#+ M_1 / M_2
PBC2="$(echo "scale=9; ${M_1} / ${M_2}" | bc)"
# Compare and record ENCODE QC evaluation based on the value of NRF
eval_NRF="$(evaluate_metric "NRF" "${NRF}")"
# Compare and record ENCODE QC evaluation based on the value of PBC1
eval_PBC1="$(evaluate_metric "PBC1" "${PBC1}")"
# Compare and record ENCODE QC evaluation based on the value of PBC2
eval_PBC2="$(evaluate_metric "PBC2" "${PBC2}")"
if ${debug}; then
echo "
bam=${bam}
total=${total}
dup=${dup}
nondup=${nondup}
M_1=${M_1}
M_2=${M_2}
M_distinct=${M_distinct}
NRF=${NRF}
PBC1=${PBC1}
PBC2=${PBC2}
eval_NRF=${eval_NRF}
eval_PBC1=${eval_PBC1}
eval_PBC2=${eval_PBC2}
"
else
echo "bam"$'\t'"total"$'\t'"dup"$'\t'"nondup"$'\t'"M_1"$'\t'"M_2"$'\t'"M_distinct"$'\t'"NRF"$'\t'"PBC1"$'\t'"PBC2"$'\t'"eval_NRF"$'\t'"eval_PBC1"$'\t'"eval_PBC2"
echo "$(basename "${bam}")"$'\t'"${total}"$'\t'"${dup}"$'\t'"${nondup}"$'\t'"${M_1}"$'\t'"${M_2}"$'\t'"${M_distinct}"$'\t'"${NRF}"$'\t'"${PBC1}"$'\t'"${PBC2}"$'\t'"${eval_NRF}"$'\t'"${eval_PBC1}"$'\t'"${eval_PBC2}"
fi
# SC mode
#
# M_1_1 2902897
# M_2_1 2189664
# M_distinct_1 8747978
# NRF_1 .685554002 acceptable
# PBC1_1 .331836339 severe
# PBC2_1 1.325727143 moderate
#
# M_1_40 2725652
# M_2_40 2020843
# M_distinct_40 8068906
# NRF_40 .677960672 acceptable
# PBC1_40 .337796970 severe
# PBC2_40 1.348769795 moderate