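Standardization is important in PCA because PCA is a variance-maximizing procedure: it projects the original data onto the directions along which the variance is largest. If the features are measured on different scales, the features with the largest numeric ranges dominate those directions, so the projection mostly reflects the units of measurement rather than the actual structure of the data.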


# Main imports
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns

%matplotlib inline
# Load the dataset: read the file named after the URL's last path component
# if it exists on disk, otherwise read straight from the URL.
import os
url = ""
in_file = os.path.basename(url)
if os.path.exists(in_file):
    df = pd.read_csv(in_file)
else:
    df = pd.read_csv(url)
# Discard the "Unnamed: 0" column (raises KeyError if it is absent).
df = df.drop(columns=["Unnamed: 0"])
# Sklearn imports
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# Fit one PCA pipeline with and one without scaling / standardization.
# The fitted pipelines are stored in `pipelines`, keyed by a label that says
# whether a StandardScaler step precedes the PCA step.
n_components = 4
pipelines = {}
for with_scaling in [True, False]:
    # Produces "with scaling" or "without scaling". The label must not be
    # reassigned below: each variant needs its own key, otherwise the second
    # fit silently overwrites the first one in `pipelines`.
    name = "with%s scaling" % ("" if with_scaling else "out")
    steps = []
    if with_scaling:
        steps.append(("scaler", StandardScaler()))
    steps.append(("pca", PCA(n_components=n_components)))
    pipeline = Pipeline(steps).fit(df)
    pipelines[name] = pipeline
# Plot the results
def plot_pipelines(pipelines):
    """Plot explained variance and model covariance for each fitted pipeline.

    One column of axes is drawn per pipeline: the top axis shows a bar chart
    of the explained variance of each principal component, the bottom axis
    shows the matrix returned by the PCA step's ``get_covariance()``.

    Parameters
    ----------
    pipelines : dict
        Maps a name to a fitted pipeline whose last step is the PCA
        estimator (it is read via ``pipeline.steps[-1][1]``).
    """
    # Two rows (variance on top, covariance below) and one column per
    # pipeline. squeeze=False keeps `axes` two-dimensional even for a single
    # pipeline, so every row of `axes.T` unpacks into exactly two axes.
    _, axes = plt.subplots(2, len(pipelines), squeeze=False)
    for i, (row, (name, pipeline)) in enumerate(zip(axes.T, pipelines.items())):
        pca = pipeline.steps[-1][1]
        ax2, ax1 = row
        ax1.matshow(pca.get_covariance(), cmap="viridis")
        # Take the component count from the fitted estimator so the bars and
        # tick labels always match the data being plotted.
        n_pcs = len(pca.explained_variance_)
        positions = .5 + np.arange(n_pcs)
        # NOTE(review): the beginning of this call was truncated in the
        # source; a bar chart of the explained variance, centred on the tick
        # positions set below, is assumed -- confirm against the original.
        ax2.bar(positions, pca.explained_variance_)
        ax2.set_xticks(positions)
        ax2.set_xticklabels(["PC%i" % (c + 1)
                             for c in range(n_pcs)])
        if i == 0:
            # Only the left-most column carries the shared y-axis label.
            ax2.set_ylabel("explained variance")