import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from tqdm import tqdm_notebook
import IPython.display as display
import textwrap
import warnings
# Suppress all warnings
warnings.filterwarnings("ignore")
%matplotlib inline
# Load data
afr = "pca-afr.eigenvec-sorted.xlsx"
gmp_a = pd.read_excel(afr, sheet_name="pca-data")
gmp = gmp_a
eth_order = ["GMB","GWD","SEN","MAL","MSL","GUI","BKF","CIV","GHA","BEN","YRI","ESN","BER","CAM","DRC","UGN","LWK","ZAM","MLW","BWN","RSA"]
gmp.head()
| FID | IID | PC1 | PC2 | PC3 | PC4 | PC5 | PC6 | PC7 | PC8 | PC9 | PC10 | ETH | Population | CLUSTERa | CLUSTERb | CLUSTER | POP | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | SC_GMFUL5306349 | SC_GMFUL5306349 | 0.024255 | -0.006405 | 0.020890 | -0.009007 | 0.010061 | -0.004022 | -0.007109 | -0.007006 | -0.012017 | 0.005776 | GMB | Gambia (GGVP; N=253) | CLUSTER1 (N=402) | CLUSTER1 | CLUSTER1 (N=350) | AFR |
| 1 | SC_GMFUL5306352 | SC_GMFUL5306352 | 0.014896 | 0.011660 | 0.001984 | -0.004511 | 0.024223 | -0.016878 | -0.017941 | 0.002425 | -0.022156 | -0.001535 | GMB | Gambia (GGVP; N=253) | CLUSTER1 (N=402) | CLUSTER1 | CLUSTER1 (N=350) | AFR |
| 2 | SC_GMFUL5306355 | SC_GMFUL5306355 | 0.028940 | -0.008987 | 0.017486 | -0.009400 | 0.006759 | -0.006706 | -0.003040 | 0.009046 | -0.006762 | -0.000610 | GMB | Gambia (GGVP; N=253) | CLUSTER1 (N=402) | CLUSTER1 | CLUSTER1 (N=350) | AFR |
| 3 | SC_GMFUL5306360 | SC_GMFUL5306360 | 0.020439 | -0.000084 | 0.022278 | -0.012062 | 0.020236 | -0.012620 | -0.013186 | 0.003586 | -0.010011 | 0.008572 | GMB | Gambia (GGVP; N=253) | CLUSTER1 (N=402) | CLUSTER1 | CLUSTER1 (N=350) | AFR |
| 4 | SC_GMFUL5306371 | SC_GMFUL5306371 | 0.023628 | 0.006352 | -0.004576 | 0.002971 | 0.016885 | -0.004671 | -0.014428 | -0.014132 | -0.016898 | -0.002277 | GMB | Gambia (GGVP; N=253) | CLUSTER1 (N=402) | CLUSTER1 | CLUSTER1 (N=350) | AFR |
# Plot PCA and kernel densities
fig, (axs1, axs2) = plt.subplots(2,1, figsize=(11,15))
cnt = sns.kdeplot(data=gmp, hue="CLUSTER", x="PC1", y="PC2", thresh=0.05, levels=4, color=".2", alpha=0.5, fill=False, ax=axs1)
sct = sns.scatterplot(data=gmp, hue="Population", x="PC1", y="PC2", ax=axs1)
kde = sns.kdeplot(data=gmp, hue="CLUSTER", x="PC1", y="PC2", thresh=0.05, fill=True, ax=axs2)
plt.subplots_adjust(right=0.75, bottom=0.2)
sns.move_legend(sct, "upper left", bbox_to_anchor=(1, 1), fontsize=8.5, title="Population")
sns.move_legend(kde, "upper left", bbox_to_anchor=(1, 1), fontsize=13, title="Cluster")
axs1.text(-0.05, 0.05, "A", fontsize=18, color='black')
axs2.text(-0.05, 0.05, "B", fontsize=18, color='black')
plt.savefig('figs/Figure1.pdf', format='pdf', dpi=600)
plt.show()
plt.close()
# Plot MAFs
wgs_clst1_pops = pd.read_excel(
'Source_Data.xlsx',
sheet_name="AF-merged-pops-cat1"
)
wgs_clst2_pops = pd.read_excel(
'Source_Data.xlsx',
sheet_name="AF-merged-pops-cat2"
)
wgs_clst3_pops = pd.read_excel(
'Source_Data.xlsx',
sheet_name="AF-merged-pops-cat3"
)
plt_namea = 'figs/' + "wgs_clst_pops_cat1.png"
plt_nameb = 'figs/' + "wgs_clst_pops_cat2.png"
plt_namec = 'figs/' + "wgs_clst_pops_cat3.png"
data1 = wgs_clst1_pops
df1 = data1.set_index('SNP')
data2 = wgs_clst2_pops
df2 = data2.set_index('SNP')
data3 = wgs_clst3_pops
df3 = data3.set_index('SNP')
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2)
fig.delaxes(ax4)
plt1 = df1.plot(
ax=ax1,
kind="bar",
width=0.7,
alpha=0.90,
in_layout=True,
figsize = (15, 10),
subplots = False,
layout = (2,2),
style="--",
legend=False,
rot=60,
fontsize=14
)
plt1.grid(True, alpha = 0.2)
plt2 = df2.plot(
ax=ax2,
kind="bar",
width=0.7,
alpha=0.90,
in_layout=True,
figsize = (15, 10),
subplots = False,
layout = (2,2),
style="--",
legend=False,
rot=60,
fontsize=14
)
plt2.grid(True, alpha = 0.2)
plt3 = df3.plot(
ax=ax3,
kind="bar",
width=0.7,
alpha=0.90,
in_layout=True,
figsize = (15, 10),
subplots = False,
layout = (2,2),
style="--",
legend=False,
rot=60,
fontsize=14
)
plt3.grid(True, alpha = 0.2)
# Since all plots have same legend, use that for plt1
handles1, labels1 = ax1.get_legend_handles_labels()
all_handles = handles1
all_labels = labels1
# Create a single legend for the entire figure
fig.legend(all_handles, all_labels, loc='lower right', bbox_to_anchor=(0.9, 0.15), ncol=1, fontsize=18)
plt.subplots_adjust(right=0.90, left=0.80, bottom=0, top=0.9)
fig.tight_layout()
plt.show()
plt.close()
wgs_clst1_clusters = pd.read_excel(
'Source_Data.xlsx',
sheet_name="AF-merged-clusters-cat1"
)
wgs_clst2_clusters = pd.read_excel(
'Source_Data.xlsx',
sheet_name="AF-merged-clusters-cat2"
)
wgs_clst3_clusters = pd.read_excel(
'Source_Data.xlsx',
sheet_name="AF-merged-clusters-cat3"
)
plt_name = 'figs/' + "Figure2.pdf"
data1 = wgs_clst1_clusters
df1 = data1.set_index('SNP')
data2 = wgs_clst2_clusters
df2 = data2.set_index('SNP')
data3 = wgs_clst3_clusters
df3 = data3.set_index('SNP')
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2)
fig.delaxes(ax4)
plt1 = df1.plot(
ax=ax1,
kind="bar",
width=0.7,
alpha=0.80,
in_layout=True,
figsize = (15, 10),
subplots = False,
layout = (2,2),
style="--",
legend=False,
rot=60,
fontsize=14
)
plt1.grid(True, alpha = 0.2)
plt1.text(0, 0.14, 'A', fontsize=18, color='black')
ax1.set_xlabel("")
ax1.set_ylabel("Allele frequency", fontsize=14)
plt2 = df2.plot(
ax=ax2,
kind="bar",
width=0.7,
alpha=0.80,
in_layout=True,
figsize = (15, 10),
subplots = False,
layout = (2,2),
style="--",
legend=False,
rot=60,
fontsize=14
)
plt2.grid(True, alpha = 0.2)
plt2.text(0.1, 0.22, 'B', fontsize=18, color='black')
ax2.set_xlabel("SNP", fontsize=14)
plt3 = df3.plot(
ax=ax3,
kind="bar",
width=0.7,
alpha=0.80,
in_layout=True,
figsize = (15, 10),
subplots = False,
layout = (2,2),
style="--",
legend=False,
rot=60,
fontsize=14
)
plt3.grid(True, alpha = 0.2)
ax3.set_xlabel("SNP", fontsize=14)
ax3.set_ylabel("Allele frequency", fontsize=14)
plt3.text(0, 0.32, 'C', fontsize=18, color='black')
# Since all plots have same legend, use that for plt1
handles1, labels1 = ax1.get_legend_handles_labels()
all_handles = handles1
all_labels = labels1
# Create a single legend for the entire figure
fig.legend(all_handles, all_labels, loc='lower right', bbox_to_anchor=(0.8, 0.15), ncol=1, fontsize=18)
plt.subplots_adjust(bottom=0.2, top=0.9)
fig.tight_layout()
plt.savefig(plt_name, format='pdf', dpi=600)
plt.show()
plt.close()
# Load data
pgxscan_pain = pd.read_excel("reports/pgxscan-merged-reports-pop.xlsx", sheet_name="pain-diplotype-report")
pgxscan_pain.head()
| KEY | Gene | name | hap1_main | hap2_main | Diplotype_main | Phenotype | ETH | POP | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | C1_cyp2b6 | cyp2b6 | C1 | *1 | *6 | *1/*6 | normal_metabolizer | NONSCD | AFR |
| 1 | C101_cyp2b6 | cyp2b6 | C101 | *1 | *1 | *1/*1 | normal_metabolizer | NONSCD | AFR |
| 2 | C102_cyp2b6 | cyp2b6 | C102 | *1 | *22 | *1/*22 | ultrarapid_metabolizer | NONSCD | AFR |
| 3 | C103_cyp2b6 | cyp2b6 | C103 | *1 | *6 | *1/*6 | normal_metabolizer | NONSCD | AFR |
| 4 | C109_cyp2b6 | cyp2b6 | C109 | *1 | *6 | *1/*6 | normal_metabolizer | NONSCD | AFR |
# list of relevant phenotypes
functional_phenotypes = [
'ultrarapid_metabolizer',
'intermediate_metabolizer',
'poor_function/metabolizer',
'poor_metabolizer',
'poor_function',
'decreased_function',
'increased_function'
]
# function to extract AFR pops from data
def get_afr_data(data):
"""extract AFR populations from data"""
data_afr = data[data['POP'] == "AFR"]
# get uniq gene names from data
genes = data_afr['Gene'].unique()
# initialize and get genes that have at least one relevant phenotype
genes_with_pheno = []
for gene in genes:
df = data_afr[data_afr['Gene'] == gene]
phenotypes = df['Phenotype'].unique()
if any(pheno in phenotypes for pheno in functional_phenotypes):
genes_with_pheno.append(gene)
genes_with_pheno_df = data_afr[data_afr['Gene'].str.contains('|'.join(genes_with_pheno), case=True, na=False)]
return genes_with_pheno_df
pgxscan_afr = get_afr_data(pgxscan_pain)
pgxscan_afr.head()
| KEY | Gene | name | hap1_main | hap2_main | Diplotype_main | Phenotype | ETH | POP | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | C1_cyp2b6 | cyp2b6 | C1 | *1 | *6 | *1/*6 | normal_metabolizer | NONSCD | AFR |
| 1 | C101_cyp2b6 | cyp2b6 | C101 | *1 | *1 | *1/*1 | normal_metabolizer | NONSCD | AFR |
| 2 | C102_cyp2b6 | cyp2b6 | C102 | *1 | *22 | *1/*22 | ultrarapid_metabolizer | NONSCD | AFR |
| 3 | C103_cyp2b6 | cyp2b6 | C103 | *1 | *6 | *1/*6 | normal_metabolizer | NONSCD | AFR |
| 4 | C109_cyp2b6 | cyp2b6 | C109 | *1 | *6 | *1/*6 | normal_metabolizer | NONSCD | AFR |
from itertools import chain
from termcolor import colored
# plot for all datasets, saving the datasets, category and variable in a tuple,
# all placed within a dictionary with name to serve as plot name
datasets = {
('ETH','Phenotype','pgxscan_pain'):pgxscan_pain
}
for item, data in zip(datasets, datasets.values()):
c_name = list(item)[0]
v_name = list(item)[1]
p_name = list(item)[2]
print(
"plotting bar charts " + "for " +
p_name + " using " + c_name +
" (category), " + v_name +
" (plotting variable), " +
p_name + " (output name)..."
)
uniq_genes = list(set(data.Gene))
panel_list = list("ABCD")
fig, axs = plt.subplots(2,2)
axs = axs.flatten()
handles_lables = []
for i, (gene, figpanel) in enumerate(zip(uniq_genes, panel_list)):
gene_df = data[data['Gene'] == gene]
df = pd.crosstab(gene_df[c_name], gene_df[v_name])
df_perc = df.iloc[:, :].apply(lambda x: x / x.sum(), axis=1) * 100
df_perc.plot(
ax=axs[i],
kind="barh",
stacked=True,
figsize=(12,11),
title=gene,
legend=None,
sharex=False,
sharey=True
)
axs[i].grid(True, alpha = 0.2)
axs[i].set_title(gene.upper(), style='italic')
# ADD LEGEND
nvars = len(gene_df[v_name].unique())
if nvars <= 8:
ncols = 2
elif nvars > 8 and nvars <= 12:
ncols = 3
else:
ncols = 4
axs[i].legend(loc='upper center', bbox_to_anchor=(0.5, -0.06), ncol=ncols, fontsize=10)
axs[i].text(0.1, 1.55, figpanel, fontsize=18, color='black')
plt.tight_layout()
plt.subplots_adjust(bottom=0.1)
# Get handles and labels from all axes
handles_lables.append(axs[i].get_legend_handles_labels())
plt.savefig("figs/Figure3.pdf", format='pdf', dpi=600)
plt.show()
plt.close()
plotting bar charts for pgxscan_pain using ETH (category), Phenotype (plotting variable), pgxscan_pain (output name)...
# Dictionary of disease categories and corresponding drugs
drug_data = {
"Cancer (Oncology)": ["fluorouracil", "methotrexate", "cyclophosphamide", "cisplatin", "doxorubicin", "etoposide", "paclitaxel", "tamoxifen", "rituximab", "lapatinib", "gefitinib", "oxaliplatin", "daunorubicin", "irinotecan", "gemcitabine"],
"Autoimmune & Inflammatory Disorders": ["adalimumab", "etanercept", "azathioprine", "methotrexate", "hydroxychloroquine", "infliximab", "sulfasalazine", "tacrolimus", "rituximab", "dexamethasone", "prednisone", "mycophenolic acid", "sirolimus"],
"Cardiovascular Diseases": ["metoprolol", "atenolol", "propranolol", "carvedilol", "bisoprolol", "warfarin", "clopidogrel", "dabigatran", "atorvastatin", "simvastatin", "rosuvastatin", "captopril", "enalapril", "furosemide", "spironolactone", "hydrochlorothiazide"],
"Pain Management & Anesthetics": ["ibuprofen", "diclofenac", "naproxen", "celecoxib", "morphine", "oxycodone", "fentanyl", "tramadol", "hydrocodone", "lidocaine", "bupivacaine", "sevoflurane", "isoflurane", "propofol"],
"Neurological & Psychiatric Disorders": ["fluoxetine", "sertraline", "amitriptyline", "escitalopram", "venlafaxine", "duloxetine", "olanzapine", "risperidone", "quetiapine", "haloperidol", "valproic acid", "lamotrigine", "carbamazepine", "gabapentin", "amphetamine", "methylphenidate", "modafinil", "donepezil", "rivastigmine", "galantamine", "tetrabenazine"],
"Infectious Diseases (Antibiotics, Antivirals, Antifungals)": ["ciprofloxacin", "azithromycin", "amoxicillin", "cephalexin", "vancomycin", "acyclovir", "oseltamivir", "fluconazole", "voriconazole", "amphotericin B"],
"Gastrointestinal Disorders": ["omeprazole", "pantoprazole", "ranitidine", "lansoprazole", "metoclopramide", "ondansetron", "mesalazine", "loperamide"],
"Endocrine & Metabolic Disorders": ["metformin", "insulin", "levothyroxine", "propylthiouracil", "methimazole", "glipizide", "glyburide", "pioglitazone"],
"Respiratory Diseases": ["salbutamol", "salmeterol", "fluticasone", "budesonide", "montelukast", "theophylline"],
"Other (Ophthalmology, Dermatology, Rare Diseases, etc.)": ["latanoprost", "timolol", "hydroxychloroquine", "thalidomide", "ivacaftor"]
}
drug_df = pd.DataFrame(
list(drug_data.items()),
columns=["Disease Category", "Drugs"]
)
display.display(drug_df)
| Disease Category | Drugs | |
|---|---|---|
| 0 | Cancer (Oncology) | [fluorouracil, methotrexate, cyclophosphamide,... |
| 1 | Autoimmune & Inflammatory Disorders | [adalimumab, etanercept, azathioprine, methotr... |
| 2 | Cardiovascular Diseases | [metoprolol, atenolol, propranolol, carvedilol... |
| 3 | Pain Management & Anesthetics | [ibuprofen, diclofenac, naproxen, celecoxib, m... |
| 4 | Neurological & Psychiatric Disorders | [fluoxetine, sertraline, amitriptyline, escita... |
| 5 | Infectious Diseases (Antibiotics, Antivirals, ... | [ciprofloxacin, azithromycin, amoxicillin, cep... |
| 6 | Gastrointestinal Disorders | [omeprazole, pantoprazole, ranitidine, lansopr... |
| 7 | Endocrine & Metabolic Disorders | [metformin, insulin, levothyroxine, propylthio... |
| 8 | Respiratory Diseases | [salbutamol, salmeterol, fluticasone, budesoni... |
| 9 | Other (Ophthalmology, Dermatology, Rare Diseas... | [latanoprost, timolol, hydroxychloroquine, tha... |
# Categories and their respective drug counts (estimated from previous analysis)
drug_categories = {
"Cancer (Oncology)": 25,
"Autoimmune & Inflammatory Disorders": 20,
"Cardiovascular Diseases": 22,
"Pain Management & Anesthetics": 18,
"Neurological & Psychiatric Disorders": 24,
"Infectious Diseases (Antibiotics, Antivirals, Antifungals)": 15,
"Gastrointestinal Disorders": 10,
"Endocrine & Metabolic Disorders": 12,
"Respiratory Diseases": 8,
"Other (Ophthalmology, Dermatology, Rare Diseases, etc.)": 6
}
drug_labels = drug_categories.keys()
drug_counts = drug_categories.values()
# Wrap the labels
def wrap_labels(labels, width):
"""Wraps the text of labels to a maximum width."""
wrapped_labels = [textwrap.fill(label, width) for label in labels]
return wrapped_labels
wrapped_labels = wrap_labels(drug_labels, 30) # Adjust the width as needed
# Explode the 1st of the ten wedges (Cancer - index 0)
explode = (0.12, 0, 0, 0, 0, 0, 0, 0, 0, 0) # only "explode" the 2nd slice
explode_cancer = (0.28, 0, 0, 0, 0, 0, 0, 0, 0, 0) # only "explode" the 2nd slice
fig, ax = plt.subplots(figsize=(10, 7), subplot_kw=dict(aspect="equal"))
# add percentages to donut chart
autotexts = ax.pie(drug_counts, wedgeprops=dict(width=0.5), autopct='%1.1f%%', startangle=-40, explode=explode)
# add new donut to include labels and connecting lines
wedges, texts = ax.pie(drug_counts, wedgeprops={'width': 0.6, 'edgecolor': 'white'}, startangle=-40, explode=explode)
bbox_props = dict(boxstyle="square,pad=0.3", fc="w", ec="k", lw=0.72)
kw = dict(arrowprops=dict(arrowstyle="-"),
bbox=bbox_props, zorder=0, va="center")
for i, p in enumerate(wedges):
ang = (p.theta2 - p.theta1)/2. + p.theta1
y = np.sin(np.deg2rad(ang))
x = np.cos(np.deg2rad(ang))
horizontalalignment = {-1: "right", 1: "left"}[int(np.sign(x))]
connectionstyle = f"angle,angleA=0,angleB={ang}"
kw["arrowprops"].update({"connectionstyle": connectionstyle})
ax.annotate(wrapped_labels[i], xy=(x, y), xytext=(1.35*np.sign(x), 1.4*y),
horizontalalignment=horizontalalignment, **kw)
plt.subplots_adjust(top=0.8)
ax.set_title("Proportion of Drugs in CPIC gene-drug pair by Disease Category", y=1.2, pad=-14)
plt.savefig('figs/Figure4.pdf', format='pdf', dpi=600)
plt.show()
plt.close()