Statistics for Data Scientists

8 - Unsupervised Learning

8.1 - Data Preparation

8.1.1 - Importing the Python Packages

import math
from pathlib import Path
import pandas as pd
import numpy as np
#
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
#
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from scipy.stats import multivariate_normal
#
import prince
#
import matplotlib.pyplot as plt
from matplotlib import cm 
from matplotlib.colors import from_levels_and_colors
import seaborn as sns

8.1.2 - Data Directory

The DATA directory contains the .csv files used in the examples.

DATA = './'

8.1.3 - Dataset Paths

If you do not keep your data in the same directory as the code, adapt the path names accordingly.

SP500_DATA_CSV = DATA + 'sp500_data.csv.gz'
SP500_SECTORS_CSV = DATA + 'sp500_sectors.csv'
LOAN_DATA_CSV = DATA + 'loan_data.csv.gz'
HOUSE_TASKS_CSV = DATA + 'housetasks.csv'
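
Since pathlib's Path is already imported above, the same paths can also be built in a more portable way; a minimal sketch, where the directory is an assumption to adapt:

# A sketch using pathlib (imported above); point DATA_DIR at your data folder.
DATA_DIR = Path('.')
SP500_DATA_CSV = DATA_DIR / 'sp500_data.csv.gz'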

8.2 - Principal Components Analysis

8.2.1 - A Simple Example

sp500_px = pd.read_csv(SP500_DATA_CSV, index_col=0)
oil_px = sp500_px[['XOM', 'CVX']]
print(oil_px.head())
#
pcs = PCA(n_components=2)
pcs.fit(oil_px)
loadings = pd.DataFrame(pcs.components_, columns=oil_px.columns)
print(loadings)
def abline(slope, intercept, ax):
    """Calculate coordinates of a line based on slope and intercept"""
    x_vals = np.array(ax.get_xlim())
    return (x_vals, intercept + slope * x_vals)

ax = oil_px.plot.scatter(x='XOM', y='CVX', alpha=0.3, figsize=(4, 4))
ax.set_xlim(-3, 3)
ax.set_ylim(-3, 3)
ax.plot(*abline(loadings.loc[0, 'CVX'] / loadings.loc[0, 'XOM'], 0, ax),
        '--', color='C1')
ax.plot(*abline(loadings.loc[1, 'CVX'] / loadings.loc[1, 'XOM'], 0, ax),
        '--', color='C1')

plt.tight_layout()
plt.show()
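
The fitted PCA object also reports the share of total variance each component captures; a quick check (not in the original code) on the pcs object fitted above:

# Fraction of the total variance explained by each of the two components.
print(pcs.explained_variance_ratio_)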

8.2.2 - Interpreting principal components

syms = sorted(['AAPL', 'MSFT', 'CSCO', 'INTC', 'CVX', 'XOM', 'SLB', 'COP',
        'JPM', 'WFC', 'USB', 'AXP', 'WMT', 'TGT', 'HD', 'COST'])
top_sp = sp500_px.loc[sp500_px.index >= '2011-01-01', syms]

sp_pca = PCA()
sp_pca.fit(top_sp)

explained_variance = pd.DataFrame(sp_pca.explained_variance_)
ax = explained_variance.head(10).plot.bar(legend=False, figsize=(4, 4))
ax.set_xlabel('Component')

plt.tight_layout()
plt.show()
loadings = pd.DataFrame(sp_pca.components_[0:5, :], 
                        columns=top_sp.columns)
print(loadings)
maxPC = 1.01 * loadings.loc[0:5, :].abs().to_numpy().max()

f, axes = plt.subplots(5, 1, figsize=(5, 5), sharex=True)

for i, ax in enumerate(axes):
    pc_loadings = loadings.loc[i, :]
    colors = ['C0' if l > 0 else 'C1' for l in pc_loadings]
    ax.axhline(color='#888888')
    pc_loadings.plot.bar(ax=ax, color=colors)
    ax.set_ylabel(f'PC{i+1}')
    ax.set_ylim(-maxPC, maxPC)

plt.tight_layout()
plt.show()

8.2.3 - Correspondence Analysis

housetasks = pd.read_csv(HOUSE_TASKS_CSV, index_col=0)

ca = prince.CA(n_components=2)
ca = ca.fit(housetasks)

ca.plot_coordinates(housetasks, figsize=(6, 6))
plt.tight_layout()
plt.show()
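
The row and column coordinates behind the plot can also be inspected directly; a sketch, assuming the prince.CA methods row_coordinates and column_coordinates:

# Principal coordinates of the tasks (rows) and household members (columns).
print(ca.row_coordinates(housetasks).head())
print(ca.column_coordinates(housetasks).head())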

8.3 - K-Means Clustering

8.3.1 - A Simple Example

df = sp500_px.loc[sp500_px.index >= '2011-01-01', ['XOM', 'CVX']]
kmeans = KMeans(n_clusters=4).fit(df)
df['cluster'] = kmeans.labels_
print(df.head())
centers = pd.DataFrame(kmeans.cluster_centers_, columns=['XOM', 'CVX'])
print(centers)
fig, ax = plt.subplots(figsize=(4, 4))
ax = sns.scatterplot(
    x='XOM', y='CVX',
    hue='cluster', style='cluster', 
    ax=ax, data=df)
ax.set_xlim(-3, 3)
ax.set_ylim(-3, 3)
centers.plot.scatter(
    x='XOM', y='CVX',
    ax=ax, s=50,
    color='black')
plt.tight_layout()
plt.show()

8.3.2 - K-Means Algorithm

By default, the _Scikit-Learn_ algorithm repeats the clustering 10 times with different random starting centroids (`n_init`); `max_iter` limits the number of iterations within each run. The sketch after the code below makes these defaults explicit.

syms = sorted(['AAPL', 'MSFT', 'CSCO', 'INTC', 'CVX', 'XOM', 'SLB', 'COP', 
               'JPM', 'WFC', 'USB', 'AXP', 'WMT', 'TGT', 'HD', 'COST'])
top_sp = sp500_px.loc[sp500_px.index >= '2011-01-01', syms]
kmeans = KMeans(n_clusters=5).fit(top_sp)
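
As a sketch (km and the random_state value are illustrative, not part of the original code), the defaults described above can be written out explicitly:

# Same fit with the defaults made explicit; n_iter_ reports how many
# iterations the best of the n_init runs needed to converge.
km = KMeans(n_clusters=5, n_init=10, max_iter=300, random_state=1).fit(top_sp)
print(km.n_iter_)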

Interpreting the Clusters

from collections import Counter
print(Counter(kmeans.labels_))
centers = pd.DataFrame(kmeans.cluster_centers_, columns=syms)
f, axes = plt.subplots(5, 1, figsize=(5, 6), sharex=True)
for i, ax in enumerate(axes):
    center = centers.loc[i, :]
    maxPC = 1.01 * np.abs(center).max()
    colors = ['C0' if l > 0 else 'C1' for l in center]
    ax.axhline(color='#888888')
    center.plot.bar(ax=ax, color=colors)
    ax.set_ylabel(f'Cluster {i + 1}')
    ax.set_ylim(-maxPC, maxPC)
plt.tight_layout()
plt.show()

8.3.3 - Selecting the Number of Clusters

inertia = []
for n_clusters in range(2, 15):
    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(top_sp)
    inertia.append(kmeans.inertia_ / n_clusters)
inertias = pd.DataFrame({'n_clusters': range(2, 15), 'inertia': inertia})
ax = inertias.plot(x='n_clusters', y='inertia')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Average Within-Cluster Squared Distances')
plt.ylim((0, 1.1 * inertias.inertia.max()))
ax.legend().set_visible(False)
plt.tight_layout()
plt.show()
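
To read the elbow off numerically rather than by eye, one can tabulate how much the average inertia falls with each extra cluster; a small sketch (not in the original) over the inertias frame built above:

# Drop in average within-cluster dispersion per additional cluster;
# the elbow is roughly where these drops level off.
inertias['drop'] = -inertias['inertia'].diff()
print(inertias)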

8.4 - Hierarchical Clustering

8.4.1 - A Simple Example

syms1 = ['AAPL', 'AMZN', 'AXP', 'COP', 'COST', 'CSCO', 'CVX', 'GOOGL', 'HD', 
         'INTC', 'JPM', 'MSFT', 'SLB', 'TGT', 'USB', 'WFC', 'WMT', 'XOM']
df = sp500_px.loc[sp500_px.index >= '2011-01-01', syms1].transpose()
Z = linkage(df, method='complete')
print(Z.shape)

8.4.2 - The Dendrogram

fig, ax = plt.subplots(figsize=(5, 5))
dendrogram(Z, labels=list(df.index), color_threshold=0)
plt.xticks(rotation=90)
ax.set_ylabel('distance')

plt.tight_layout()
plt.show()
memb = fcluster(Z, 4, criterion='maxclust')
memb = pd.Series(memb, index=df.index)
for key, item in memb.groupby(memb):
    print(f"{key} : {', '.join(item.index)}")

8.4.3 - Measures of Dissimilarity

df = sp500_px.loc[sp500_px.index >= '2011-01-01', ['XOM', 'CVX']]
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(5, 5))
for i, method in enumerate(['single', 'average', 'complete', 'ward']):
    ax = axes[i // 2, i % 2]
    Z = linkage(df, method=method)
    colors = [f'C{c+1}' for c in fcluster(Z, 4, criterion='maxclust')]
    ax = sns.scatterplot(x='XOM', y='CVX', hue=colors, style=colors,
                         size=0.5, ax=ax, data=df, legend=False)

    ax.set_xlim(-3, 3)
    ax.set_ylim(-3, 3)
    ax.set_title(method)
plt.tight_layout()
plt.show()

8.5 - Model-Based Clustering

8.5.1 - Multivariate Normal Distribution

Define a color map that corresponds to the probability levels.

mean = [0.5, -0.5]
cov = [[1, 1], [1, 2]]
probability = [.5, .75, .95, .99]
#
def probLevel(p):
    # Density height of the contour enclosing probability p; for a bivariate
    # normal this is (1 - p) / (2*pi*sqrt(det(cov))), and here
    # D = sqrt(det(cov)) = 1 for the covariance matrix defined above.
    D = 1
    return (1 - p) / (2 * math.pi * D)
#
levels = [probLevel(p) for p in probability]
#
fig, ax = plt.subplots(figsize=(5, 5))
#
x, y = np.mgrid[-2.8:3.8:.01, -5:4:.01]
pos = np.empty(x.shape + (2,))
pos[:, :, 0] = x; pos[:, :, 1] = y
rv = multivariate_normal(mean, cov)
#
CS = ax.contourf(x, y, rv.pdf(pos), cmap=cm.GnBu, levels=50)
ax.contour(CS, levels=levels, colors=['black'])
ax.plot(*mean, color='black', marker='o')
#
plt.tight_layout()
plt.show()
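
The heights returned by probLevel follow from the bivariate normal density: the contour enclosing probability p sits at height (1 - p) / (2 * pi * sqrt(det(cov))), and sqrt(det(cov)) = 1 here. A Monte Carlo sketch (not in the original) to sanity-check this:

# The fraction of samples whose density exceeds probLevel(p) should be
# close to p for each probability level.
samples = rv.rvs(size=100_000, random_state=0)
densities = rv.pdf(samples)
for p in probability:
    print(p, np.mean(densities >= probLevel(p)))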

8.5.2 - Mixtures of Normals

df = sp500_px.loc[sp500_px.index >= '2011-01-01', ['XOM', 'CVX']]
mclust = GaussianMixture(n_components=2).fit(df)
print(mclust.bic(df))
fig, ax = plt.subplots(figsize=(4, 4))
colors = [f'C{c}' for c in mclust.predict(df)]
df.plot.scatter(x='XOM', y='CVX', c=colors, alpha=0.5, ax=ax)
ax.set_xlim(-3, 3)
ax.set_ylim(-3, 3)
plt.tight_layout()
plt.show()
print('Mean')
print(mclust.means_)
print('Covariances')
print(mclust.covariances_)
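
Unlike K-means, the mixture model yields soft assignments; a quick look (not in the original) at the cluster membership probabilities via predict_proba:

# Posterior probability of belonging to each cluster, first five rows.
print(mclust.predict_proba(df)[:5].round(3))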

8.5.3 - Selecting the Number of Clusters

results = []
covariance_types = ['full', 'tied', 'diag', 'spherical']
for n_components in range(1, 9):
    for covariance_type in covariance_types:
        mclust = GaussianMixture(n_components=n_components, warm_start=True,
                                 covariance_type=covariance_type)
        mclust.fit(df)
        results.append({
            'bic': mclust.bic(df),
            'n_components': n_components,
            'covariance_type': covariance_type,
        })
#
results = pd.DataFrame(results)
#
colors = ['C0', 'C1', 'C2', 'C3']
styles = ['C0-','C1:','C0-.', 'C1--']
#
fig, ax = plt.subplots(figsize=(4, 4))
for i, covariance_type in enumerate(covariance_types):
    subset = results.loc[results.covariance_type == covariance_type, :]
    subset.plot(x='n_components', y='bic', ax=ax, label=covariance_type, 
                kind='line', style=styles[i]) # , color=colors[i])
#
plt.tight_layout()
plt.show()
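
The preferred configuration is the one that minimizes BIC; a one-line sketch over the results frame built above:

# Row with the lowest BIC across all fitted combinations.
print(results.loc[results['bic'].idxmin()])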

8.6 - Scaling and Categorical Variables

8.6.1 - Scaling the Variables

loan_data = pd.read_csv(LOAN_DATA_CSV)
loan_data['outcome'] = pd.Categorical(loan_data['outcome'], 
                                      categories=['paid off', 'default'], 
                                      ordered=True)
defaults = loan_data.loc[loan_data['outcome'] == 'default', :]

columns = ['loan_amnt', 'annual_inc', 'revol_bal', 'open_acc', 
           'dti', 'revol_util']

df = defaults[columns]
kmeans = KMeans(n_clusters=4, random_state=1).fit(df)
counts = Counter(kmeans.labels_)

centers = pd.DataFrame(kmeans.cluster_centers_, columns=columns)
centers['size'] = [counts[i] for i in range(4)]
print(centers)

scaler = preprocessing.StandardScaler()
df0 = scaler.fit_transform(df * 1.0)  # '* 1.0' casts integer columns to float

kmeans = KMeans(n_clusters=4, random_state=1).fit(df0)
counts = Counter(kmeans.labels_)

centers = pd.DataFrame(scaler.inverse_transform(kmeans.cluster_centers_), 
                       columns=columns)
centers['size'] = [counts[i] for i in range(4)]
print(centers)

8.6.2 - Dominant Variables

syms = ['GOOGL', 'AMZN', 'AAPL', 'MSFT', 'CSCO', 'INTC', 'CVX', 'XOM', 
        'SLB', 'COP', 'JPM', 'WFC', 'USB', 'AXP', 'WMT', 'TGT', 'HD', 'COST']
top_sp1 = sp500_px.loc[sp500_px.index >= '2005-01-01', syms]

sp_pca1 = PCA()
sp_pca1.fit(top_sp1)

explained_variance = pd.DataFrame(sp_pca1.explained_variance_)
ax = explained_variance.head(10).plot.bar(legend=False, figsize=(4, 4))
ax.set_xlabel('Component')

plt.tight_layout()
plt.show()
loadings = pd.DataFrame(
    sp_pca1.components_[0:2, :], 
    columns=top_sp1.columns)
print(loadings.transpose())
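
The leading components here are dominated by the highest-variance stocks. One common remedy (a sketch, not the author's code) is to standardize the columns before the PCA, using the StandardScaler imported above:

# With unit-variance columns, no single stock can dominate the loadings
# through sheer scale alone.
top_sp1_std = preprocessing.StandardScaler().fit_transform(top_sp1)
sp_pca_std = PCA().fit(top_sp1_std)
loadings_std = pd.DataFrame(sp_pca_std.components_[0:2, :],
                            columns=top_sp1.columns)
print(loadings_std.transpose())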

8.6.3 - Problems with Clustering Mixed Data

columns = ['dti', 'payment_inc_ratio', 'home_', 'pub_rec_zero']
df = pd.get_dummies(defaults[columns])

scaler = preprocessing.StandardScaler()

df0 = scaler.fit_transform(df * 1.0)
kmeans = KMeans(n_clusters=4, random_state=1).fit(df0)
centers = pd.DataFrame(scaler.inverse_transform(kmeans.cluster_centers_), 
                       columns=df.columns)
print(centers)