From db2c07aaeaa614525aed055dd2316f72861b7aba Mon Sep 17 00:00:00 2001
From: Stephen Beckstrom-Sternberg <Stephen.Beckstrom-Sternberg@azdhs.gov>
Date: Wed, 19 Jul 2023 17:56:35 -0700
Subject: [PATCH] Create HeatCluster-0.4.0.py

Automatically uses the appropriate input snp matrix. Shortens/simplifies long sample names.
---
 bin/HeatCluster-0.4.0.py | 133 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 133 insertions(+)
 create mode 100644 bin/HeatCluster-0.4.0.py

diff --git a/bin/HeatCluster-0.4.0.py b/bin/HeatCluster-0.4.0.py
new file mode 100644
index 0000000..2a15178
--- /dev/null
+++ b/bin/HeatCluster-0.4.0.py
@@ -0,0 +1,133 @@
+#!/bin/python3
+import re
+import numpy as np
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+import scipy.cluster.hierarchy as sch
+from sklearn.cluster import KMeans
+from sklearn.metrics import silhouette_samples
+from pathlib import Path
+from io import StringIO  # Import StringIO directly from the 'io' module
+
+try:
+    path = Path('./snp-dists.txt')
+    path.resolve(strict=True)
+except FileNotFoundError:
+    path = Path('./snp_matrix.txt')
+
+print("Using file path:", path)
+
+# Helper function to perform replacements using regular expressions
+def replace_with_regex(text):
+    return re.sub(
+        r"\.consensus_threshold_\d+\.\d+_quality_\d+",
+        '',
+        text.replace(",", "\t")
+            .replace('snp-dists 0.8.2\t', '')
+            .replace('_contigs', '')
+            .replace('_genomic', '')
+            .replace("Consensus_", '')
+    )
+
+# Read the entire content of the file and perform replacements
+with open(path, 'r') as file:
+    content = file.read()
+    content = replace_with_regex(content)
+
+# Create a DataFrame from the modified content
+df = pd.read_csv(StringIO(content), sep='\t', index_col=0)  # Use StringIO directly
+
+# Handle any potential non-finite values in the DataFrame
+df = df.replace([np.inf, -np.inf], np.nan).dropna(inplace=False)
+
+numSamples = len(df.columns)
+
+print("Found", numSamples, "samples in snp_matrix.txt")
+
+if numSamples <= 2:
+    print("This matrix has too few samples or has been melted. Sorry!")
+    exit(0)
+else:
+    
+# Calculate the appropriate figure size based on the number of samples
+
+    if numSamples <= 20:
+        figureSize = (10, 8)
+    else:
+        figureSize = (round(numSamples / 2), round(numSamples / 2.5))
+        print("\n\nNumber of samples:", numSamples, "\nFigure size:", figureSize)
+
+# Now, let's proceed to compute clusters
+clusters = sch.linkage(df.values, method='complete', metric='euclidean')
+
+# Create clustermap and get the order of rows and columns based on clustering
+clustergrid = sns.clustermap(
+    df, xticklabels=True, yticklabels=True, vmin=0, vmax=50, center=20,
+    annot=True, annot_kws={'size': 6}, cbar_kws={"orientation": "vertical", "pad": 0.5},
+    cmap='Reds_r', linecolor="white", linewidths=.01, fmt='d', dendrogram_ratio=0.1,
+    col_cluster=True, row_cluster=True, figsize=figureSize
+)
+plt.setp(clustergrid.ax_heatmap.get_xticklabels(), rotation=45, ha='right')
+
+# Suppress printing of dendrogram along the y-axis
+clustergrid.ax_row_dendrogram.set_visible(False)
+clustergrid.ax_col_dendrogram.set_visible(True)
+row_order = clustergrid.dendrogram_row.reordered_ind
+col_order = row_order
+
+# Sort the DataFrame based on the cluster order
+sorted_df = df.iloc[row_order, col_order]
+
+# Compute the number of SNPs within the cluster per row
+within_cluster_snps = sorted_df.apply(lambda row: row[row < 500].sum(), axis=1)
+
+# Add 'Within_Cluster_SNPs' column to the sorted DataFrame
+sorted_df['Within_Cluster_SNPs'] = within_cluster_snps.values
+
+# Calculate silhouette scores for different numbers of clusters
+silhouette_scores = []
+upper_range = min(numSamples, 11)
+for n_clusters in range(2, upper_range):
+    kmeans = KMeans(n_clusters=n_clusters, n_init=10)
+    cluster_labels = kmeans.fit_predict(sorted_df.values)
+    silhouette_avg = silhouette_samples(sorted_df.values, cluster_labels).mean()
+    silhouette_scores.append(silhouette_avg)
+
+# Find the optimal number of clusters with the highest silhouette score
+optimal_num_clusters = silhouette_scores.index(max(silhouette_scores)) + 2
+
+# Use the optimal number of clusters to assign cluster labels and sort the DataFrame
+kmeans = KMeans(n_clusters=optimal_num_clusters, n_init=10)
+cluster_labels = kmeans.fit_predict(sorted_df.values)
+sorted_df['Cluster'] = cluster_labels
+
+# Sort the DataFrame first by 'Cluster' and then by 'Within_Cluster_SNPs'
+sorted_df = sorted_df.sort_values(by=['Cluster', 'Within_Cluster_SNPs'], ascending=[True, True], kind="mergesort")
+
+# Drop 'Cluster' and 'Within_Cluster_SNPs' columns
+sorted_df = sorted_df.drop(['Cluster', 'Within_Cluster_SNPs'], axis='columns')
+sorted_df = sorted_df.reindex(columns=sorted_df.index)
+
+# Save the finally sorted, tab-delimited SNP matrix
+sorted_df.to_csv('Final_snp_matrix.tsv', sep='\t', header=True, index=True)
+
+# Create the reordered heatmap with correct values
+heatmap = sns.clustermap(
+    sorted_df, xticklabels=True, yticklabels=True, vmin=0, vmax=50, center=20,
+    annot=True, annot_kws={'size': 6}, cbar_kws={"orientation": "vertical", "pad": 0.5},
+    cmap='Reds_r', linecolor="white", linewidths=.01, fmt='d', dendrogram_ratio=0.15,
+    col_cluster=True, row_cluster=True, figsize=figureSize
+)
+plt.setp(heatmap.ax_heatmap.get_xticklabels(), rotation=45, ha='right')
+heatmap.ax_row_dendrogram.set_visible(False)
+heatmap.ax_col_dendrogram.set_visible(False)
+
+# Save the reordered heatmap as a PDF and PNG
+heatmap.savefig('SNP_matrix.pdf')
+heatmap.savefig('SNP_matrix.png')
+heatmap.savefig('SNP_matrix_mqc.png')
+plt.show()
+plt.close()
+
+print("Saved heatmap as SNP_matrix.{pdf,png}")