import math
from pathlib import Path
import pandas as pd
import numpy as np
#
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
#
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from scipy.stats import multivariate_normal
#
import prince
#
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.colors import from_levels_and_colors
import seaborn as sns
The DATA directory contains the .csv files used in the examples.
DATA = './'
If you don't keep your data in the same directory as the code, adapt the path names.
SP500_DATA_CSV = DATA + 'sp500_data.csv.gz'
SP500_SECTORS_CSV = DATA + 'sp500_sectors.csv'
LOAN_DATA_CSV = DATA + 'loan_data.csv.gz'
HOUSE_TASKS_CSV = DATA + 'housetasks.csv'
sp500_px = pd.read_csv(SP500_DATA_CSV, index_col=0)
oil_px = sp500_px[['XOM', 'CVX']]
print(oil_px.head())
#
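# Principal components analysis: fit two components to the XOM/CVX data;
# pcs.components_ holds the loadings (one row per component).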
pcs = PCA(n_components=2)
pcs.fit(oil_px)
loadings = pd.DataFrame(pcs.components_, columns=oil_px.columns)
print(loadings)
def abline(slope, intercept, ax):
    """Calculate coordinates of a line based on slope and intercept"""
    x_vals = np.array(ax.get_xlim())
    return (x_vals, intercept + slope * x_vals)
ax = oil_px.plot.scatter(x='XOM', y='CVX', alpha=0.3, figsize=(4, 4))
ax.set_xlim(-3, 3)
ax.set_ylim(-3, 3)
ax.plot(*abline(loadings.loc[0, 'CVX'] / loadings.loc[0, 'XOM'], 0, ax),
        '--', color='C1')
ax.plot(*abline(loadings.loc[1, 'CVX'] / loadings.loc[1, 'XOM'], 0, ax),
        '--', color='C1')
plt.tight_layout()
plt.show()
syms = sorted(['AAPL', 'MSFT', 'CSCO', 'INTC', 'CVX', 'XOM', 'SLB', 'COP',
               'JPM', 'WFC', 'USB', 'AXP', 'WMT', 'TGT', 'HD', 'COST'])
top_sp = sp500_px.loc[sp500_px.index >= '2011-01-01', syms]
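# Screeplot: fit a full PCA and plot the variance explained by the first 10 components.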
sp_pca = PCA()
sp_pca.fit(top_sp)
explained_variance = pd.DataFrame(sp_pca.explained_variance_)
ax = explained_variance.head(10).plot.bar(legend=False, figsize=(4, 4))
ax.set_xlabel('Component')
plt.tight_layout()
plt.show()
loadings = pd.DataFrame(sp_pca.components_[0:5, :],
                        columns=top_sp.columns)
print(loadings)
maxPC = 1.01 * loadings.loc[0:5, :].abs().to_numpy().max()
f, axes = plt.subplots(5, 1, figsize=(5, 5), sharex=True)
for i, ax in enumerate(axes):
    pc_loadings = loadings.loc[i, :]
    colors = ['C0' if l > 0 else 'C1' for l in pc_loadings]
    ax.axhline(color='#888888')
    pc_loadings.plot.bar(ax=ax, color=colors)
    ax.set_ylabel(f'PC{i+1}')
    ax.set_ylim(-maxPC, maxPC)
plt.tight_layout()
plt.show()
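# Correspondence analysis of the housetasks table using the prince package.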
housetasks = pd.read_csv(HOUSE_TASKS_CSV, index_col=0)
ca = prince.CA(n_components=2)
ca = ca.fit(housetasks)
ca.plot_coordinates(housetasks, figsize=(6, 6))
plt.tight_layout()
plt.show()
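# K-means clustering: partition the XOM/CVX data into four clusters.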
df = sp500_px.loc[sp500_px.index >= '2011-01-01', ['XOM', 'CVX']]
kmeans = KMeans(n_clusters=4).fit(df)
df['cluster'] = kmeans.labels_
print(df.head())
centers = pd.DataFrame(kmeans.cluster_centers_, columns=['XOM', 'CVX'])
print(centers)
fig, ax = plt.subplots(figsize=(4, 4))
ax = sns.scatterplot(
    x='XOM', y='CVX',
    hue='cluster', style='cluster',
    ax=ax, data=df)
ax.set_xlim(-3, 3)
ax.set_ylim(-3, 3)
centers.plot.scatter(
    x='XOM', y='CVX',
    ax=ax, s=50,
    color='black')
plt.tight_layout()
plt.show()
By default, the _Scikit-Learn_ algorithm repeats the clustering 10 times with different random starting centroids (`n_init`); `max_iter` controls the maximum number of iterations within each run.
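As a sketch, these parameters can be made explicit when constructing the estimator (the values shown mirror the defaults described above; `random_state` is added here only for reproducibility):
# Make the restart and iteration behavior explicit: 10 random restarts,
# up to 300 iterations each
kmeans = KMeans(n_clusters=4, n_init=10, max_iter=300,
                random_state=0).fit(df[['XOM', 'CVX']])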
syms = sorted(['AAPL', 'MSFT', 'CSCO', 'INTC', 'CVX', 'XOM', 'SLB', 'COP',
               'JPM', 'WFC', 'USB', 'AXP', 'WMT', 'TGT', 'HD', 'COST'])
top_sp = sp500_px.loc[sp500_px.index >= '2011-01-01', syms]
kmeans = KMeans(n_clusters=5).fit(top_sp)
### Interpreting the Clusters
from collections import Counter
print(Counter(kmeans.labels_))
centers = pd.DataFrame(kmeans.cluster_centers_, columns=syms)
f, axes = plt.subplots(5, 1, figsize=(5, 6), sharex=True)
for i, ax in enumerate(axes):
    center = centers.loc[i, :]
    maxPC = 1.01 * np.abs(center).max()
    colors = ['C0' if l > 0 else 'C1' for l in center]
    ax.axhline(color='#888888')
    center.plot.bar(ax=ax, color=colors)
    ax.set_ylabel(f'Cluster {i + 1}')
    ax.set_ylim(-maxPC, maxPC)
plt.tight_layout()
plt.show()
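# Elbow plot: average within-cluster sum of squares for increasing numbers of clusters.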
inertia = []
for n_clusters in range(2, 15):
    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(top_sp)
    inertia.append(kmeans.inertia_ / n_clusters)
inertias = pd.DataFrame({'n_clusters': range(2, 15), 'inertia': inertia})
ax = inertias.plot(x='n_clusters', y='inertia')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Average Within-Cluster Squared Distances')
plt.ylim((0, 1.1 * inertias.inertia.max()))
ax.legend().set_visible(False)
plt.tight_layout()
plt.show()
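# Hierarchical clustering: each row of the linkage matrix Z records one merge step.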
syms1 = ['AAPL', 'AMZN', 'AXP', 'COP', 'COST', 'CSCO', 'CVX', 'GOOGL', 'HD',
         'INTC', 'JPM', 'MSFT', 'SLB', 'TGT', 'USB', 'WFC', 'WMT', 'XOM']
df = sp500_px.loc[sp500_px.index >= '2011-01-01', syms1].transpose()
Z = linkage(df, method='complete')
print(Z.shape)
fig, ax = plt.subplots(figsize=(5, 5))
dendrogram(Z, labels=list(df.index), color_threshold=0)
plt.xticks(rotation=90)
ax.set_ylabel('distance')
plt.tight_layout()
plt.show()
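# Cut the dendrogram into four clusters and list the members of each.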
memb = fcluster(Z, 4, criterion='maxclust')
memb = pd.Series(memb, index=df.index)
for key, item in memb.groupby(memb):
    print(f"{key} : {', '.join(item.index)}")
df = sp500_px.loc[sp500_px.index >= '2011-01-01', ['XOM', 'CVX']]
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(5, 5))
for i, method in enumerate(['single', 'average', 'complete', 'ward']):
    ax = axes[i // 2, i % 2]
    Z = linkage(df, method=method)
    colors = [f'C{c+1}' for c in fcluster(Z, 4, criterion='maxclust')]
    ax = sns.scatterplot(x='XOM', y='CVX', hue=colors, style=colors,
                         size=0.5, ax=ax, data=df, legend=False)
    ax.set_xlim(-3, 3)
    ax.set_ylim(-3, 3)
    ax.set_title(method)
plt.tight_layout()
plt.show()
Define a color map that corresponds to the probability levels.
mean = [0.5, -0.5]
cov = [[1, 1], [1, 2]]
probability = [.5, .75, .95, .99]
#
def probLevel(p):
    """Density level of the bivariate normal contour that encloses probability p"""
    D = 1  # sqrt of det(cov); for cov = [[1, 1], [1, 2]], det = 1
    return (1 - p) / (2 * math.pi * D)
#
levels = [probLevel(p) for p in probability]
#
fig, ax = plt.subplots(figsize=(5, 5))
#
x, y = np.mgrid[-2.8:3.8:.01, -5:4:.01]
pos = np.empty(x.shape + (2,))
pos[:, :, 0] = x
pos[:, :, 1] = y
rv = multivariate_normal(mean, cov)
#
CS = ax.contourf(x, y, rv.pdf(pos), cmap=cm.GnBu, levels=50)
ax.contour(CS, levels=levels, colors=['black'])
ax.plot(*mean, color='black', marker='o')
#
plt.tight_layout()
plt.show()
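# Model-based clustering: fit a two-component Gaussian mixture;
# bic() returns the Bayesian information criterion.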
df = sp500_px.loc[sp500_px.index >= '2011-01-01', ['XOM', 'CVX']]
mclust = GaussianMixture(n_components=2).fit(df)
print(mclust.bic(df))
fig, ax = plt.subplots(figsize=(4, 4))
colors = [f'C{c}' for c in mclust.predict(df)]
df.plot.scatter(x='XOM', y='CVX', c=colors, alpha=0.5, ax=ax)
ax.set_xlim(-3, 3)
ax.set_ylim(-3, 3)
plt.tight_layout()
plt.show()
print('Mean')
print(mclust.means_)
print('Covariances')
print(mclust.covariances_)
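# Selecting the number of components: compare BIC across component counts
# and covariance structures.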
results = []
covariance_types = ['full', 'tied', 'diag', 'spherical']
for n_components in range(1, 9):
    for covariance_type in covariance_types:
        mclust = GaussianMixture(n_components=n_components, warm_start=True,
                                 covariance_type=covariance_type)
        mclust.fit(df)
        results.append({
            'bic': mclust.bic(df),
            'n_components': n_components,
            'covariance_type': covariance_type,
        })
#
results = pd.DataFrame(results)
#
colors = ['C0', 'C1', 'C2', 'C3']
styles = ['C0-', 'C1:', 'C0-.', 'C1--']
#
fig, ax = plt.subplots(figsize=(4, 4))
for i, covariance_type in enumerate(covariance_types):
    subset = results.loc[results.covariance_type == covariance_type, :]
    subset.plot(x='n_components', y='bic', ax=ax, label=covariance_type,
                kind='line', style=styles[i])  # , color=colors[i])
#
plt.tight_layout()
plt.show()
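# Scaling the variables: cluster loan defaults with and without standardization
# and compare the resulting cluster centers.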
loan_data = pd.read_csv(LOAN_DATA_CSV)
loan_data['outcome'] = pd.Categorical(loan_data['outcome'],
                                      categories=['paid off', 'default'],
                                      ordered=True)
defaults = loan_data.loc[loan_data['outcome'] == 'default', :]
columns = ['loan_amnt', 'annual_inc', 'revol_bal', 'open_acc',
           'dti', 'revol_util']
df = defaults[columns]
kmeans = KMeans(n_clusters=4, random_state=1).fit(df)
counts = Counter(kmeans.labels_)
centers = pd.DataFrame(kmeans.cluster_centers_, columns=columns)
centers['size'] = [counts[i] for i in range(4)]
print(centers)
scaler = preprocessing.StandardScaler()
df0 = scaler.fit_transform(df * 1.0)
kmeans = KMeans(n_clusters=4, random_state=1).fit(df0)
counts = Counter(kmeans.labels_)
centers = pd.DataFrame(scaler.inverse_transform(kmeans.cluster_centers_),
                       columns=columns)
centers['size'] = [counts[i] for i in range(4)]
print(centers)
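# Dominant variables: repeat the PCA with the high-variance stocks GOOGL and AMZN included.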
syms = ['GOOGL', 'AMZN', 'AAPL', 'MSFT', 'CSCO', 'INTC', 'CVX', 'XOM',
        'SLB', 'COP', 'JPM', 'WFC', 'USB', 'AXP', 'WMT', 'TGT', 'HD', 'COST']
top_sp1 = sp500_px.loc[sp500_px.index >= '2005-01-01', syms]
sp_pca1 = PCA()
sp_pca1.fit(top_sp1)
explained_variance = pd.DataFrame(sp_pca1.explained_variance_)
ax = explained_variance.head(10).plot.bar(legend=False, figsize=(4, 4))
ax.set_xlabel('Component')
plt.tight_layout()
plt.show()
loadings = pd.DataFrame(
    sp_pca1.components_[0:2, :],
    columns=top_sp1.columns)
print(loadings.transpose())
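# Categorical variables: one-hot encode the factor columns (pd.get_dummies)
# before scaling and clustering.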
columns = ['dti', 'payment_inc_ratio', 'home_', 'pub_rec_zero']
df = pd.get_dummies(defaults[columns])
scaler = preprocessing.StandardScaler()
df0 = scaler.fit_transform(df * 1.0)
kmeans = KMeans(n_clusters=4, random_state=1).fit(df0)
centers = pd.DataFrame(scaler.inverse_transform(kmeans.cluster_centers_),
                       columns=df.columns)
print(centers)