Description
Principal Coordinates Analysis.
How different are microbial communities from each other?

‣
Principal Coordinates Analysis.
How different are microbial communities from each other?

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib.patches import Ellipse
import matplotlib.transforms as transforms
import seaborn as sns
from scipy.spatial.distance import pdist, squareform
from adjustText import adjust_text
#load data in
meta = pd.read_csv("metadata.tsv", sep="\t")
mpa_raw = pd.read_csv("merged_metaphlan.tsv", sep="\t", comment="#")
mpa_raw.columns = mpa_raw.columns.str.replace("_1_metaphlan", "", regex=False)
mpa_raw = mpa_raw.rename(columns={mpa_raw.columns[0]: "clade_name"})
#filter to species
species = mpa_raw[
mpa_raw["clade_name"].str.contains("s__") &
~mpa_raw["clade_name"].str.contains("t__")
].copy()
species = species.set_index("clade_name")
species = species[meta["SampleID"].tolist()] # keep our 14 samples
#Bray-Curtis distance matrix
#samples rows, species columns
mat = species.T.values # shape: (14 samples, 491 species)
mat = mat / 100.0
dist_vec = pdist(mat, metric="braycurtis")
dist_sq = squareform(dist_vec) # 14×14 distance matrix
#PCoA via double-centering (classical MDS)
n = dist_sq.shape[0]
D2 = dist_sq ** 2
J = np.eye(n) - np.ones((n, n)) / n
B = -0.5 * J @ D2 @ J
eigenvalues, eigenvectors = np.linalg.eigh(B)
idx = np.argsort(eigenvalues)[::-1] # descending
eigenvalues = eigenvalues[idx]
eigenvectors = eigenvectors[:, idx]
# Keep only positive eigenvalues
pos = eigenvalues > 1e-10
coords = eigenvectors[:, pos] * np.sqrt(eigenvalues[pos])
var_exp = eigenvalues[pos] / eigenvalues[pos].sum() * 100
pc1, pc2 = coords[:, 0], coords[:, 1]
pct1, pct2 = var_exp[0], var_exp[1]
#plot dataframe
plot_df = meta.copy()
plot_df["PC1"] = pc1
plot_df["PC2"] = pc2
#colours & markers
type_colors = {
"Sludge": "#7B4F2E",
"Water": "#2166AC",
"Biofilm": "#6BAB6E",
}
time_markers = {
"May2024": "s", # square
"July2025": "o", # circle
}
#ellipse
def confidence_ellipse(x, y, ax, n_std=2.0, **kwargs):
"""Draw a covariance ellipse around a group of points."""
if len(x) < 3:
return
cov = np.cov(x, y)
vals, vecs = np.linalg.eigh(cov)
order = vals.argsort()[::-1]
vals, vecs = vals[order], vecs[:, order]
angle = np.degrees(np.arctan2(*vecs[:, 0][::-1]))
width = 2 * n_std * np.sqrt(vals[0])
height = 2 * n_std * np.sqrt(vals[1])
e = Ellipse(
xy=(np.mean(x), np.mean(y)),
width=width, height=height,
angle=angle, **kwargs
)
ax.add_patch(e)
#plot
sns.set_theme(style="white", font="sans-serif")
fig, ax = plt.subplots(figsize=(9, 7))
# Draw confidence ellipses per sample type (background)
for stype, color in type_colors.items():
grp = plot_df[plot_df["Type"] == stype]
confidence_ellipse(
grp["PC1"].values, grp["PC2"].values, ax,
n_std=1.5,
facecolor=color, alpha=0.12,
edgecolor=color, linewidth=1.5, linestyle="--"
)
texts = [] # adjustText
# Plot each sample
for _, row in plot_df.iterrows():
color = type_colors[row["Type"]]
marker = time_markers[row["TimePoint"]]
ax.scatter(
row["PC1"], row["PC2"],
color=color, marker=marker,
s=120, edgecolors="white", linewidths=0.8,
zorder=3
)
# Sample label: short description
short = row["Description"].replace("Aquarium", "Aq.").replace("aquarium", "aq.")
t = ax.text(
row["PC1"], row["PC2"],
short,
fontsize=8.5, color="#222222", fontweight="bold",
zorder=4,
bbox=dict(boxstyle="round,pad=0.25", facecolor="white",
edgecolor="none", alpha=0.75)
)
texts.append(t)
adjust_text(
texts,
x=plot_df["PC1"].tolist(), # where the X dots are
y=plot_df["PC2"].tolist(), # where the Y dots are
ax=ax,
expand_points=(4.0, 4.0),
expand_text=(2.5, 2.5),
force_points=(2.5, 2.5), # push away from the dots
arrowprops=dict(arrowstyle="-", color="grey", lw=0.6, alpha=0.7)
)
#axes, labels
ax.axhline(0, color="grey", linewidth=0.5, linestyle=":")
ax.axvline(0, color="grey", linewidth=0.5, linestyle=":")
ax.set_xlabel(f"PC1 ({pct1:.1f}% variance)", fontsize=11, fontweight="bold")
ax.set_ylabel(f"PC2 ({pct2:.1f}% variance)", fontsize=11, fontweight="bold")
ax.set_title(
"Gowanus Biofilm BioBAT — Beta Diversity (PCoA)",
fontsize=13, fontweight="bold", pad=12
)
ax.text(
0.5, 0.99,
"Bray-Curtis dissimilarity · MetaPhlAn 4 species-level",
transform=ax.transAxes, ha="center", fontsize=9, color="grey"
)
ax.spines[["top", "right"]].set_visible(False)
ax.spines[["left", "bottom"]].set_color("grey")
#legends
#sample type (colour)
type_handles = [
mpatches.Patch(color=col, label=stype)
for stype, col in type_colors.items()
]
time_handles = [
plt.scatter([], [], marker=m, color="grey", s=80, label=tp)
for tp, m in time_markers.items()
]
leg1 = ax.legend(
handles=type_handles,
title="Sample Type", title_fontsize=9,
fontsize=8, loc="upper right",
bbox_to_anchor=(1.00, 0.98), # top-right corner, away from title
frameon=True, framealpha=0.9, edgecolor="lightgrey"
)
ax.add_artist(leg1)
ax.legend(
handles=time_handles,
title="Time Point", title_fontsize=9,
fontsize=8, loc="lower right",
bbox_to_anchor=(1.00, 0.72), # bottom-right corner, away from axes
frameon=True, framealpha=0.9, edgecolor="lightgrey"
)
plt.tight_layout()
#save
plt.savefig("pcoa_betadiversity.pdf", dpi=300, bbox_inches="tight")
plt.savefig("pcoa_betadiversity.png", dpi=300, bbox_inches="tight")
print(f"✓ Saved: pcoa_betadiversity.pdf and pcoa_betadiversity.png")
print(f"\n── Variance explained ──")
for i, pct in enumerate(var_exp[:4]):
print(f" PC{i+1}: {pct:.1f}%")
print(f"\n── Bray-Curtis distance matrix ──")
dist_df = pd.DataFrame(dist_sq, index=meta["SampleID"], columns=meta["SampleID"])
print(dist_df.round(3).to_string())