import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from tqdm import tqdm_notebook
import IPython.display as display
import textwrap

import warnings

# Suppress all warnings
warnings.filterwarnings("ignore")

%matplotlib inline


# Load data
afr = "pca-afr.eigenvec-sorted.xlsx"
gmp_a = pd.read_excel(afr, sheet_name="pca-data")
gmp = gmp_a
eth_order = ["GMB","GWD","SEN","MAL","MSL","GUI","BKF","CIV","GHA","BEN","YRI","ESN","BER","CAM","DRC","UGN","LWK","ZAM","MLW","BWN","RSA"]
gmp.head()


# Plot PCA and kernel densities
fig, (axs1, axs2) = plt.subplots(2,1, figsize=(11,15))
cnt = sns.kdeplot(data=gmp, hue="CLUSTER", x="PC1", y="PC2", thresh=0.05, levels=4, color=".2", alpha=0.5, fill=False, ax=axs1)
sct = sns.scatterplot(data=gmp, hue="Population", x="PC1", y="PC2", ax=axs1)
kde = sns.kdeplot(data=gmp, hue="CLUSTER", x="PC1", y="PC2", thresh=0.05, fill=True, ax=axs2)
plt.subplots_adjust(right=0.75, bottom=0.2)
sns.move_legend(sct, "upper left", bbox_to_anchor=(1, 1), fontsize=8.5, title="Population")
sns.move_legend(kde, "upper left", bbox_to_anchor=(1, 1), fontsize=13, title="Cluster")
axs1.text(-0.05, 0.05, "A", fontsize=18, color='black')
axs2.text(-0.05, 0.05, "B", fontsize=18, color='black')
plt.savefig('figs/Figure1.pdf', format='pdf', dpi=600)
plt.show()
plt.close()


# Plot MAFs
wgs_clst1_pops = pd.read_excel(
    'Source_Data.xlsx', 
    sheet_name="AF-merged-pops-cat1"
)

wgs_clst2_pops = pd.read_excel(
    'Source_Data.xlsx', 
    sheet_name="AF-merged-pops-cat2"
)

wgs_clst3_pops = pd.read_excel(
    'Source_Data.xlsx', 
    sheet_name="AF-merged-pops-cat3"
)

plt_namea = 'figs/' + "wgs_clst_pops_cat1.png"
plt_nameb = 'figs/' + "wgs_clst_pops_cat2.png"
plt_namec = 'figs/' + "wgs_clst_pops_cat3.png"

data1 = wgs_clst1_pops
df1 = data1.set_index('SNP')

data2 = wgs_clst2_pops
df2 = data2.set_index('SNP')

data3 = wgs_clst3_pops
df3 = data3.set_index('SNP')

fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2)
fig.delaxes(ax4)

plt1 = df1.plot(
    ax=ax1,
    kind="bar", 
    width=0.7, 
    alpha=0.90, 
    in_layout=True, 
    figsize = (15, 10), 
    subplots = False,
    layout = (2,2),
    style="--",
    legend=False,
    rot=60,
    fontsize=14
)
plt1.grid(True, alpha = 0.2)

plt2 = df2.plot(
    ax=ax2,
    kind="bar", 
    width=0.7, 
    alpha=0.90, 
    in_layout=True, 
    figsize = (15, 10), 
    subplots = False,
    layout = (2,2),
    style="--",
    legend=False,
    rot=60,
    fontsize=14
)
plt2.grid(True, alpha = 0.2)

plt3 = df3.plot(
    ax=ax3,
    kind="bar", 
    width=0.7, 
    alpha=0.90, 
    in_layout=True, 
    figsize = (15, 10), 
    subplots = False,
    layout = (2,2),
    style="--",
    legend=False,
    rot=60,
    fontsize=14
)
plt3.grid(True, alpha = 0.2)

# Since all plots have same legend, use that for plt1
handles1, labels1 = ax1.get_legend_handles_labels()
all_handles = handles1
all_labels = labels1

# Create a single legend for the entire figure
fig.legend(all_handles, all_labels, loc='lower right', bbox_to_anchor=(0.9, 0.15), ncol=1, fontsize=18)

plt.subplots_adjust(right=0.90, left=0.80, bottom=0, top=0.9)
fig.tight_layout()
plt.show()
plt.close()


wgs_clst1_clusters = pd.read_excel(
    'Source_Data.xlsx', 
    sheet_name="AF-merged-clusters-cat1"
)

wgs_clst2_clusters = pd.read_excel(
    'Source_Data.xlsx', 
    sheet_name="AF-merged-clusters-cat2"
)

wgs_clst3_clusters = pd.read_excel(
    'Source_Data.xlsx', 
    sheet_name="AF-merged-clusters-cat3"
)

plt_name = 'figs/' + "Figure2.pdf"


data1 = wgs_clst1_clusters
df1 = data1.set_index('SNP')

data2 = wgs_clst2_clusters
df2 = data2.set_index('SNP')

data3 = wgs_clst3_clusters
df3 = data3.set_index('SNP')

fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2)
fig.delaxes(ax4)

plt1 = df1.plot(
    ax=ax1,
    kind="bar", 
    width=0.7, 
    alpha=0.80, 
    in_layout=True, 
    figsize = (15, 10), 
    subplots = False,
    layout = (2,2),
    style="--",
    legend=False,
    rot=60,
    fontsize=14
)
plt1.grid(True, alpha = 0.2)
plt1.text(0, 0.14, 'A', fontsize=18, color='black')
ax1.set_xlabel("")
ax1.set_ylabel("Allele frequency", fontsize=14)

plt2 = df2.plot(
    ax=ax2,
    kind="bar", 
    width=0.7, 
    alpha=0.80, 
    in_layout=True, 
    figsize = (15, 10), 
    subplots = False,
    layout = (2,2),
    style="--",
    legend=False,
    rot=60,
    fontsize=14
)
plt2.grid(True, alpha = 0.2)
plt2.text(0.1, 0.22, 'B', fontsize=18, color='black')
ax2.set_xlabel("SNP", fontsize=14)

plt3 = df3.plot(
    ax=ax3,
    kind="bar", 
    width=0.7, 
    alpha=0.80, 
    in_layout=True, 
    figsize = (15, 10), 
    subplots = False,
    layout = (2,2),
    style="--",
    legend=False,
    rot=60,
    fontsize=14
)
plt3.grid(True, alpha = 0.2)
ax3.set_xlabel("SNP", fontsize=14)
ax3.set_ylabel("Allele frequency", fontsize=14)

plt3.text(0, 0.32, 'C', fontsize=18, color='black')

# Since all plots have same legend, use that for plt1
handles1, labels1 = ax1.get_legend_handles_labels()
all_handles = handles1
all_labels = labels1


# Create a single legend for the entire figure
fig.legend(all_handles, all_labels, loc='lower right', bbox_to_anchor=(0.8, 0.15), ncol=1, fontsize=18)

plt.subplots_adjust(bottom=0.2, top=0.9)
fig.tight_layout()
plt.savefig(plt_name, format='pdf', dpi=600)
plt.show()
plt.close()


# Load data
pgxscan_pain = pd.read_excel("reports/pgxscan-merged-reports-pop.xlsx", sheet_name="pain-diplotype-report")
pgxscan_pain.head()


# list of relevant phenotypes
functional_phenotypes = [
    'ultrarapid_metabolizer',
    'intermediate_metabolizer',
    'poor_function/metabolizer',
    'poor_metabolizer',
    'poor_function',
    'decreased_function',
    'increased_function'
]


# function to extract AFR pops from data
def get_afr_data(data):
    """extract AFR populations from data"""
    data_afr = data[data['POP'] == "AFR"]
    # get uniq gene names from data
    genes = data_afr['Gene'].unique()
    # initialize and get genes that have at least one relevant phenotype
    genes_with_pheno = []
    for gene in genes:
        df = data_afr[data_afr['Gene'] == gene]
        phenotypes = df['Phenotype'].unique()
        if any(pheno in phenotypes for pheno in functional_phenotypes):
            genes_with_pheno.append(gene)
            genes_with_pheno_df = data_afr[data_afr['Gene'].str.contains('|'.join(genes_with_pheno), case=True, na=False)]
            
    return genes_with_pheno_df


pgxscan_afr = get_afr_data(pgxscan_pain)
pgxscan_afr.head()


from itertools import chain
from termcolor import colored
# plot for all datasets, saving the datasets, category and variable in a tuple,
# all placed within a dictionary with name to serve as plot name

datasets = {
    ('ETH','Phenotype','pgxscan_pain'):pgxscan_pain
}

for item, data in zip(datasets, datasets.values()):
    c_name = list(item)[0]
    v_name = list(item)[1]
    p_name = list(item)[2]
    print(
        "plotting bar charts " + "for " + 
        p_name + " using " + c_name + 
        " (category), " + v_name + 
        " (plotting variable), " + 
        p_name + " (output name)..."
    )
    uniq_genes = list(set(data.Gene))
    panel_list = list("ABCD")
    fig, axs = plt.subplots(2,2)
    axs = axs.flatten()
    handles_lables = []
    
    for i, (gene, figpanel) in enumerate(zip(uniq_genes, panel_list)):
        gene_df = data[data['Gene'] == gene]
        df = pd.crosstab(gene_df[c_name], gene_df[v_name])
        df_perc = df.iloc[:, :].apply(lambda x: x / x.sum(), axis=1) * 100
        df_perc.plot(
            ax=axs[i], 
            kind="barh", 
            stacked=True, 
            figsize=(12,11), 
            title=gene, 
            legend=None, 
            sharex=False,
            sharey=True
        )
        axs[i].grid(True, alpha = 0.2)
        axs[i].set_title(gene.upper(), style='italic')
        
        # ADD LEGEND
        nvars = len(gene_df[v_name].unique())
        if nvars <= 8:
            ncols = 2
        elif nvars > 8 and nvars <= 12:
            ncols = 3
        else:
            ncols = 4            
        axs[i].legend(loc='upper center', bbox_to_anchor=(0.5, -0.06), ncol=ncols, fontsize=10)
        axs[i].text(0.1, 1.55, figpanel, fontsize=18, color='black')
        plt.tight_layout()
        plt.subplots_adjust(bottom=0.1)
        
        # Get handles and labels from all axes
        handles_lables.append(axs[i].get_legend_handles_labels())

plt.savefig("figs/Figure3.pdf", format='pdf', dpi=600)
plt.show()
plt.close()

plotting bar charts for pgxscan_pain using ETH (category), Phenotype (plotting variable), pgxscan_pain (output name)...


# Dictionary of disease categories and corresponding drugs
drug_data = {
    "Cancer (Oncology)": ["fluorouracil", "methotrexate", "cyclophosphamide", "cisplatin", "doxorubicin", "etoposide", "paclitaxel", "tamoxifen", "rituximab", "lapatinib", "gefitinib", "oxaliplatin", "daunorubicin", "irinotecan", "gemcitabine"],
    "Autoimmune & Inflammatory Disorders": ["adalimumab", "etanercept", "azathioprine", "methotrexate", "hydroxychloroquine", "infliximab", "sulfasalazine", "tacrolimus", "rituximab", "dexamethasone", "prednisone", "mycophenolic acid", "sirolimus"],
    "Cardiovascular Diseases": ["metoprolol", "atenolol", "propranolol", "carvedilol", "bisoprolol", "warfarin", "clopidogrel", "dabigatran", "atorvastatin", "simvastatin", "rosuvastatin", "captopril", "enalapril", "furosemide", "spironolactone", "hydrochlorothiazide"],
    "Pain Management & Anesthetics": ["ibuprofen", "diclofenac", "naproxen", "celecoxib", "morphine", "oxycodone", "fentanyl", "tramadol", "hydrocodone", "lidocaine", "bupivacaine", "sevoflurane", "isoflurane", "propofol"],
    "Neurological & Psychiatric Disorders": ["fluoxetine", "sertraline", "amitriptyline", "escitalopram", "venlafaxine", "duloxetine", "olanzapine", "risperidone", "quetiapine", "haloperidol", "valproic acid", "lamotrigine", "carbamazepine", "gabapentin", "amphetamine", "methylphenidate", "modafinil", "donepezil", "rivastigmine", "galantamine", "tetrabenazine"],
    "Infectious Diseases (Antibiotics, Antivirals, Antifungals)": ["ciprofloxacin", "azithromycin", "amoxicillin", "cephalexin", "vancomycin", "acyclovir", "oseltamivir", "fluconazole", "voriconazole", "amphotericin B"],
    "Gastrointestinal Disorders": ["omeprazole", "pantoprazole", "ranitidine", "lansoprazole", "metoclopramide", "ondansetron", "mesalazine", "loperamide"],
    "Endocrine & Metabolic Disorders": ["metformin", "insulin", "levothyroxine", "propylthiouracil", "methimazole", "glipizide", "glyburide", "pioglitazone"],
    "Respiratory Diseases": ["salbutamol", "salmeterol", "fluticasone", "budesonide", "montelukast", "theophylline"],
    "Other (Ophthalmology, Dermatology, Rare Diseases, etc.)": ["latanoprost", "timolol", "hydroxychloroquine", "thalidomide", "ivacaftor"]
}

drug_df = pd.DataFrame(
    list(drug_data.items()), 
    columns=["Disease Category", "Drugs"]
)

display.display(drug_df)


# Categories and their respective drug counts (estimated from previous analysis)
drug_categories = {
    "Cancer (Oncology)": 25,
    "Autoimmune & Inflammatory Disorders": 20,
    "Cardiovascular Diseases": 22,
    "Pain Management & Anesthetics": 18,
    "Neurological & Psychiatric Disorders": 24,
    "Infectious Diseases (Antibiotics, Antivirals, Antifungals)": 15,
    "Gastrointestinal Disorders": 10,
    "Endocrine & Metabolic Disorders": 12,
    "Respiratory Diseases": 8,
    "Other (Ophthalmology, Dermatology, Rare Diseases, etc.)": 6
}

drug_labels = drug_categories.keys()
drug_counts = drug_categories.values()

# Wrap the labels
def wrap_labels(labels, width):
    """Wraps the text of labels to a maximum width."""
    wrapped_labels = [textwrap.fill(label, width) for label in labels]
    return wrapped_labels
wrapped_labels = wrap_labels(drug_labels, 30)  # Adjust the width as needed

# Explode the 1st of the ten wedges (Cancer - index 0)
explode = (0.12, 0, 0, 0, 0, 0, 0, 0, 0, 0)  # only "explode" the 2nd slice
explode_cancer = (0.28, 0, 0, 0, 0, 0, 0, 0, 0, 0)  # only "explode" the 2nd slice

fig, ax = plt.subplots(figsize=(10, 7), subplot_kw=dict(aspect="equal"))

# add percentages to donut chart
autotexts = ax.pie(drug_counts, wedgeprops=dict(width=0.5), autopct='%1.1f%%', startangle=-40, explode=explode)

# add new donut to include labels and connecting lines
wedges, texts = ax.pie(drug_counts, wedgeprops={'width': 0.6, 'edgecolor': 'white'}, startangle=-40, explode=explode)

bbox_props = dict(boxstyle="square,pad=0.3", fc="w", ec="k", lw=0.72)
kw = dict(arrowprops=dict(arrowstyle="-"),
          bbox=bbox_props, zorder=0, va="center")

for i, p in enumerate(wedges):
    ang = (p.theta2 - p.theta1)/2. + p.theta1
    y = np.sin(np.deg2rad(ang))
    x = np.cos(np.deg2rad(ang))
    horizontalalignment = {-1: "right", 1: "left"}[int(np.sign(x))]
    connectionstyle = f"angle,angleA=0,angleB={ang}"
    kw["arrowprops"].update({"connectionstyle": connectionstyle})
    ax.annotate(wrapped_labels[i], xy=(x, y), xytext=(1.35*np.sign(x), 1.4*y),
                horizontalalignment=horizontalalignment, **kw)

plt.subplots_adjust(top=0.8)
ax.set_title("Proportion of Drugs in CPIC gene-drug pair by Disease Category", y=1.2, pad=-14)
plt.savefig('figs/Figure4.pdf', format='pdf', dpi=600)
plt.show()
plt.close()

	FID	IID	PC1	PC2	PC3	PC4	PC5	PC6	PC7	PC8	PC9	PC10	ETH	Population	CLUSTERa	CLUSTERb	CLUSTER	POP
0	SC_GMFUL5306349	SC_GMFUL5306349	0.024255	-0.006405	0.020890	-0.009007	0.010061	-0.004022	-0.007109	-0.007006	-0.012017	0.005776	GMB	Gambia (GGVP; N=253)	CLUSTER1 (N=402)	CLUSTER1	CLUSTER1 (N=350)	AFR
1	SC_GMFUL5306352	SC_GMFUL5306352	0.014896	0.011660	0.001984	-0.004511	0.024223	-0.016878	-0.017941	0.002425	-0.022156	-0.001535	GMB	Gambia (GGVP; N=253)	CLUSTER1 (N=402)	CLUSTER1	CLUSTER1 (N=350)	AFR
2	SC_GMFUL5306355	SC_GMFUL5306355	0.028940	-0.008987	0.017486	-0.009400	0.006759	-0.006706	-0.003040	0.009046	-0.006762	-0.000610	GMB	Gambia (GGVP; N=253)	CLUSTER1 (N=402)	CLUSTER1	CLUSTER1 (N=350)	AFR
3	SC_GMFUL5306360	SC_GMFUL5306360	0.020439	-0.000084	0.022278	-0.012062	0.020236	-0.012620	-0.013186	0.003586	-0.010011	0.008572	GMB	Gambia (GGVP; N=253)	CLUSTER1 (N=402)	CLUSTER1	CLUSTER1 (N=350)	AFR
4	SC_GMFUL5306371	SC_GMFUL5306371	0.023628	0.006352	-0.004576	0.002971	0.016885	-0.004671	-0.014428	-0.014132	-0.016898	-0.002277	GMB	Gambia (GGVP; N=253)	CLUSTER1 (N=402)	CLUSTER1	CLUSTER1 (N=350)	AFR

	KEY	Gene	name	hap1_main	hap2_main	Diplotype_main	Phenotype	ETH	POP
0	C1_cyp2b6	cyp2b6	C1	*1	*6	1/6	normal_metabolizer	NONSCD	AFR
1	C101_cyp2b6	cyp2b6	C101	*1	*1	1/1	normal_metabolizer	NONSCD	AFR
2	C102_cyp2b6	cyp2b6	C102	*1	*22	1/22	ultrarapid_metabolizer	NONSCD	AFR
3	C103_cyp2b6	cyp2b6	C103	*1	*6	1/6	normal_metabolizer	NONSCD	AFR
4	C109_cyp2b6	cyp2b6	C109	*1	*6	1/6	normal_metabolizer	NONSCD	AFR

	KEY	Gene	name	hap1_main	hap2_main	Diplotype_main	Phenotype	ETH	POP
0	C1_cyp2b6	cyp2b6	C1	*1	*6	1/6	normal_metabolizer	NONSCD	AFR
1	C101_cyp2b6	cyp2b6	C101	*1	*1	1/1	normal_metabolizer	NONSCD	AFR
2	C102_cyp2b6	cyp2b6	C102	*1	*22	1/22	ultrarapid_metabolizer	NONSCD	AFR
3	C103_cyp2b6	cyp2b6	C103	*1	*6	1/6	normal_metabolizer	NONSCD	AFR
4	C109_cyp2b6	cyp2b6	C109	*1	*6	1/6	normal_metabolizer	NONSCD	AFR

	Disease Category	Drugs
0	Cancer (Oncology)	[fluorouracil, methotrexate, cyclophosphamide,...
1	Autoimmune & Inflammatory Disorders	[adalimumab, etanercept, azathioprine, methotr...
2	Cardiovascular Diseases	[metoprolol, atenolol, propranolol, carvedilol...
3	Pain Management & Anesthetics	[ibuprofen, diclofenac, naproxen, celecoxib, m...
4	Neurological & Psychiatric Disorders	[fluoxetine, sertraline, amitriptyline, escita...
5	Infectious Diseases (Antibiotics, Antivirals, ...	[ciprofloxacin, azithromycin, amoxicillin, cep...
6	Gastrointestinal Disorders	[omeprazole, pantoprazole, ranitidine, lansopr...
7	Endocrine & Metabolic Disorders	[metformin, insulin, levothyroxine, propylthio...
8	Respiratory Diseases	[salbutamol, salmeterol, fluticasone, budesoni...
9	Other (Ophthalmology, Dermatology, Rare Diseas...	[latanoprost, timolol, hydroxychloroquine, tha...

PCA and KDE plotting¶

Allele frequency plots of PGx variants among global poulations¶

Allele frequency plots of PGx variants among population clusters in Sub-Saharan Africa¶

Analyze CPIC drug list and disease categoris¶

PCA and KDE plotting¶

Allele frequency plots of PGx variants among global poulations¶

Allele frequency plots of PGx variants among population clusters in Sub-Saharan Africa¶

Barplots of phenotypes in pain-related PGx genes in SCD vs non-SCD individuals¶

Analyze CPIC drug list and disease categoris¶