Example 4: Abundance Analysis - NEIU Data Science Workshop 2025

Import Libraries and Install Dependencies¶

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

Load the data¶

Fert = pd.read_csv("Combinedsp24_2_Fert_gottcha2.full.tsv", sep="\t")
FertSp = Fert[Fert['LEVEL'] == "species"]

NoFert = pd.read_csv("Combinedsp24_2_NoFert_gottcha2.full.tsv", sep="\t")
NoFertSp = NoFert[NoFert['LEVEL'] == "species"]

Regression analysis¶

# Selecting columns
Fertspm = FertSp.iloc[:, [1, 9, 15, 16]]  # Adjust for zero-based index
NoFertspm = NoFertSp.iloc[:, [1, 9, 15, 16]]

# Merging data
Combined = pd.merge(Fertspm, NoFertspm, on="NAME", how="outer").fillna(0)

# Remove specific species
Combined = Combined[Combined['NAME'] != "Streptomyces griseofuscus"]

# Linear regression model
X = Combined['REL_ABUNDANCE_x']
y = Combined['REL_ABUNDANCE_y']
X_const = sm.add_constant(X)  # Add constant for intercept
model = sm.OLS(y, X_const).fit()

# Predict values and confidence intervals
x_values = np.linspace(X.min(), X.max(), 100)
x_values_const = sm.add_constant(x_values)
y_pred = model.predict(x_values_const)
predictions = model.get_prediction(x_values_const)
confidence_interval = predictions.conf_int(alpha=0.05)

# Calculate residuals
Combined['residual'] = y - model.predict(X_const)
Combined['abs_residual'] = Combined['residual'].abs()

# Get the 10 largest outliers
largest_outliers = Combined.nlargest(10, 'abs_residual')

# Plot
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(10, 6))

# Scatter plot
ax.scatter(X, y, c='black', label="Data")

# Regression line
ax.plot(x_values, y_pred, color='red', label="Regression Line")

# Confidence interval
ax.fill_between(
    x_values,
    confidence_interval[:, 0],  # Lower bound
    confidence_interval[:, 1],  # Upper bound
    color='red',
    alpha=0.2,
    label="95% Confidence Interval"
)
# Customize plot
ax.set_xlabel("Relative Abundance 1% Fertilizer")
ax.set_ylabel("Relative Abundance No Fertilizer")
ax.set_title("Regression Plot with 95% Confidence Interval and Largest Outliers")
ax.legend()

plt.show()

# Improve plot by labeling the 10 greatest outliers
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(10, 6))

# Scatter plot
ax.scatter(X, y, c='black', label="Data")

# Regression line
ax.plot(x_values, y_pred, color='red', label="Regression Line")

# Confidence interval
ax.fill_between(
    x_values,
    confidence_interval[:, 0],  # Lower bound
    confidence_interval[:, 1],  # Upper bound
    color='red',
    alpha=0.2,
    label="95% Confidence Interval"
)

# Label largest outliers
for _, row in largest_outliers.iterrows():
    ax.annotate(
        row['NAME'],  # Label to display
        (row['REL_ABUNDANCE_x'], row['REL_ABUNDANCE_y']),  # Position of the point
        fontsize=8,
        xytext=(5, 5),
        textcoords="offset points",
    )

# Customize plot
ax.set_xlabel("Relative Abundance 1% Fertilizer")
ax.set_ylabel("Relative Abundance No Fertilizer")
ax.set_title("Regression Plot with 95% Confidence Interval and Largest Outliers")
ax.legend()

# Show plot
plt.show()

KBase (Biology)

Example 3: Clustering Analysis

AI/ML

01 - Compare Various ML/AI Techniques to Diabetes Data