Referências
[ 1 ] Izenman, Alan Julian. "Modern multivariate statistical techniques". Springer, 2008.
[ 2 ] Bishop, Christopher M. "Pattern Recognition and Machine Learning". Springer, 2006.
[ 3 ] Fumio, Hayashi. "Econometrics". Princeton University Press, 2000.
Anexo: Código-fonte completo
import numpy as np
import matplotlib.pyplot as plt
# ****************************************************************************
# Leitura dos dados
# ****************************************************************************
import pandas as pd
import copy
detergent_df = pd.read_table('detergentyx.txt')
columns_x = list()
for column in detergent_df.columns:
if('x' in column):
columns_x.append(column)
X_df = copy.deepcopy(detergent_df[columns_x])
N, K = X_df.shape
# ****************************************************************************
# Plotando os espectros
# ****************************************************************************
fig, ax = plt.subplots()
ax.plot(np.linspace(1, K, K), X_df.transpose())
ax.set_xlim(1, K)
plt.xlabel("Frequencia")
plt.ylabel("Absorbancia")
plt.show()
# ****************************************************************************
# Calculando o posto(X)
# ****************************************************************************
r = np.linalg.matrix_rank(X_df)
print "Posto(X) = ", r
print ""
# ****************************************************************************
# Mostrando e plotando os autovalores e a metrica
# ****************************************************************************
U, s, V = np.linalg.svd(X_df, full_matrices=False)
autovalores = np.sqrt(s[0:r])
figure_handle = 1
fitness_measure = np.zeros(len(autovalores))
print " "*17, "medida de\n", " i autovalor ajuste"
for i in range(len(autovalores)):
fitness_measure[i] = np.sum(autovalores[(i+1):len(autovalores)]) / np.sum(autovalores)
print "{:2d} {:12.8f} {:12.8f}".format(i+1, autovalores[i], fitness_measure[i])
import matplotlib.pyplot as plt
plt.figure(figure_handle)
plt.clf()
plt.plot(range(1, len(autovalores)+1), autovalores, 'o:')
# plt.axes().set_yscale('log')
plt.axes().set_ylim(0, 1.01*np.max(autovalores))
plt.axes().set_xlim(1, r)
plt.xlabel('i')
plt.ylabel(r'$\lambda_i$')
plt.figure(figure_handle+1)
plt.clf()
plt.plot(range(1, len(fitness_measure)+1), fitness_measure, 'o:')
# plt.axes().set_ylim(0, 1.01*np.max(autovalores))
plt.axes().set_xlim(1, r)
plt.xlabel('t')
plt.ylabel(r'$\frac{\lambda_{t+1} + \cdots \lambda_{K}}{\lambda_{1} + \cdots \lambda_{K}}$')
# ****************************************************************************
# Calculando para t = 8
# ****************************************************************************
print "***********************************************************************"
print "Resultado para t = 8"
print "***********************************************************************"
t = 8
U, s, V = np.linalg.svd(X_df, full_matrices=False)
Lambda = np.diag(np.sqrt(s[0:t]))
V = V.transpose()[:,0:t]
Zt = np.matmul(X_df, V)
for saida in range(4):
ystr = 'y'+str(saida+1)
y = detergent_df[ystr]
beta_pcr = np.matmul(np.matmul(np.linalg.inv(np.matmul(Zt.transpose(), Zt)), Zt.transpose()), y)
beta_pcr_estrela = np.matmul(V[:,0:t], beta_pcr[0:t])
result_df = pd.DataFrame()
result_df[ystr+'_hat'] = np.matmul(Zt, beta_pcr)
result_df[ystr+'*_hat'] = np.matmul(X_df, beta_pcr_estrela)
erro_pcr = y - result_df[ystr+'_hat']
result_df[ystr] = y
result_df[ystr] = y
R2 = np.dot(result_df[ystr+'_hat'], result_df[ystr+'_hat']) / np.dot(y, y)
MSE = np.sum(np.square(erro_pcr))/(float(N)-1.0)
print "Saida: ", ystr
print result_df
print ""
print "R2 = ", R2
print "MSE = ", MSE
# ****************************************************************************
# Calculando para uma lista de t
# Usando SVD
# http://docs.scipy.org/doc/numpy-1.10.0/reference/generated/numpy.linalg.svd.html
# ****************************************************************************
print "***********************************************************************"
print "Resultados para PCS usando numpy.linalg.svd"
print "***********************************************************************"
lista_t = list([1, 2, 4, 8, 12])
total_t = len(lista_t)
tabela_beta_pcr = dict()
tabela_beta_pcr_estrela = dict()
tabela_y_calculado_pcr = dict()
tabela_y_calculado_pcr_estrela = dict()
tabela_erro_pcr = dict()
tabela_erro_pcr_estrela = dict()
tabela_beta_R2 = dict()
tabela_beta_MSE = dict()
conferencia = 0.0
total_saidas = 4
for indice_t in range(total_t):
t = lista_t[indice_t]
U, s, V = np.linalg.svd(X_df, full_matrices=False)
Lambda = np.diag(np.sqrt(s[0:t]))
V = V.transpose()[:,0:t]
Zt = np.matmul(X_df, V)
for saida in range(total_saidas):
ystr = 'y'+str(saida+1)
y = detergent_df[ystr]
if (not tabela_beta_pcr.has_key(t)):
tabela_beta_pcr[t] = np.zeros((t, total_saidas))
tabela_beta_pcr_estrela[t] = np.zeros((K, total_saidas))
tabela_y_calculado_pcr[t] = np.zeros((N, total_saidas))
tabela_y_calculado_pcr_estrela[t] = np.zeros((N, total_saidas))
tabela_erro_pcr[t] = np.zeros((N, total_saidas))
tabela_erro_pcr_estrela[t] = np.zeros((N, total_saidas))
beta_pcr = np.matmul(np.matmul(np.linalg.inv(np.matmul(Zt.transpose(), Zt)), Zt.transpose()), y)
tabela_beta_pcr[t][:,saida] = beta_pcr
y_calculado_pcr = np.matmul(Zt, beta_pcr)
tabela_y_calculado_pcr[t][:,saida] = y_calculado_pcr
erro_pcr = y - y_calculado_pcr
tabela_erro_pcr[t][:,saida] = erro_pcr
beta_pcr_estrela = np.matmul(V[:,0:t], beta_pcr[0:t])
tabela_beta_pcr_estrela[t][:,saida] = beta_pcr_estrela
y_calculado_pcr_estrela = np.matmul(X_df, beta_pcr_estrela)
tabela_y_calculado_pcr_estrela[t][:,saida] = y_calculado_pcr_estrela
tabela_erro_pcr_estrela[t][:,saida] = y - y_calculado_pcr_estrela
R2 = np.dot(y_calculado_pcr, y_calculado_pcr) / np.dot(y, y)
tabela_beta_R2[(saida,t)] = R2
MSE = np.sum(np.square(erro_pcr))/(float(N)-1.0)
tabela_beta_MSE[(saida,t)] = MSE
conferencia += np.sum(np.square(y_calculado_pcr-y_calculado_pcr_estrela))
print "Conferencia se as duas saidas calculadas sao iguais = ", conferencia
for saida in range(total_saidas):
ystr = 'y'+str(saida+1)
y = detergent_df[ystr]
print "\n"
print "Saida: y%d" % (saida+1)
print ""
print " "*10, "y",
for indice_t in range(total_t):
t = lista_t[indice_t]
print " ycalc(t={:2d})".format(t),
print ""
for k in range(len(y)):
print "{:12.6f}".format(y[k]),
for indice_t in range(total_t):
t = lista_t[indice_t]
y_calculado_pcr = tabela_y_calculado_pcr[t]
print " {:12.6f}".format(y_calculado_pcr[k,saida]),
print ""
print ""
print " "*8, "R^2",
for indice_t in range(total_t):
t = lista_t[indice_t]
print " {:12.8f}".format(tabela_beta_R2[(saida,t)]),
print ""
print " "*8, "MSE",
for indice_t in range(total_t):
t = lista_t[indice_t]
print " {:12.8f}".format(tabela_beta_MSE[(saida,t)]),
print ""
print "\n"
# ****************************************************************************
# Plotando o grafico da quantidade de variancia explicada para cada componente
# ****************************************************************************
from sklearn import decomposition
pca = decomposition.PCA(n_components=r)
pca.fit(X_df)
plt.figure(3)
plt.clf()
plt.plot(pca.explained_variance_, linewidth=2)
plt.axis('tight')
plt.xlabel('Componente i')
plt.ylabel('Variancia explicada')
# ****************************************************************************
# Calculando para uma lista de t
# Usando a biblioteca sklearn
# http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
# ****************************************************************************
print "***********************************************************************"
print "Resultados para PCR usando sklearn.decomposition.PCA"
print "***********************************************************************"
from sklearn import decomposition
lista_t = list([1, 2, 4, 8, 12])
total_t = len(lista_t)
tabela_beta_pcr = dict()
tabela_beta_pcr_estrela = dict()
tabela_y_calculado_pcr = dict()
tabela_y_calculado_pcr_estrela = dict()
tabela_erro_pcr = dict()
tabela_erro_pcr_estrela = dict()
tabela_beta_R2 = dict()
tabela_beta_MSE = dict()
conferencia = 0.0
total_saidas = 4
for indice_t in range(total_t):
t = lista_t[indice_t]
pca = decomposition.PCA(n_components=t)
pca.fit(X_df)
Lambda = np.diag(pca.explained_variance_[0:t])
V = pca.components_.transpose()[:,0:t]
Zt = np.matmul(X_df, V)
for saida in range(total_saidas):
ystr = 'y'+str(saida+1)
y = detergent_df[ystr]
if (not tabela_beta_pcr.has_key(t)):
tabela_beta_pcr[t] = np.zeros((t, total_saidas))
tabela_beta_pcr_estrela[t] = np.zeros((K, total_saidas))
tabela_y_calculado_pcr[t] = np.zeros((N, total_saidas))
tabela_y_calculado_pcr_estrela[t] = np.zeros((N, total_saidas))
tabela_erro_pcr[t] = np.zeros((N, total_saidas))
tabela_erro_pcr_estrela[t] = np.zeros((N, total_saidas))
beta_pcr = np.matmul(np.matmul(np.linalg.inv(np.matmul(Zt.transpose(), Zt)), Zt.transpose()), y)
tabela_beta_pcr[t][:,saida] = beta_pcr
y_calculado_pcr = np.matmul(Zt, beta_pcr)
tabela_y_calculado_pcr[t][:,saida] = y_calculado_pcr
erro_pcr = y - y_calculado_pcr
tabela_erro_pcr[t][:,saida] = erro_pcr
beta_pcr_estrela = np.matmul(V[:,0:t], beta_pcr[0:t])
tabela_beta_pcr_estrela[t][:,saida] = beta_pcr_estrela
y_calculado_pcr_estrela = np.matmul(X_df, beta_pcr_estrela)
tabela_y_calculado_pcr_estrela[t][:,saida] = y_calculado_pcr_estrela
tabela_erro_pcr_estrela[t][:,saida] = y - y_calculado_pcr_estrela
R2 = np.dot(y_calculado_pcr, y_calculado_pcr) / np.dot(y, y)
tabela_beta_R2[(saida,t)] = R2
MSE = np.sum(np.square(erro_pcr))/(float(N)-1.0)
tabela_beta_MSE[(saida,t)] = MSE
conferencia += np.sum(np.square(y_calculado_pcr-y_calculado_pcr_estrela))
print "Conferencia se as duas saidas calculadas sao iguais = ", conferencia
for saida in range(total_saidas):
ystr = 'y'+str(saida+1)
y = detergent_df[ystr]
print "\n"
print "Saida: y%d" % (saida+1)
print ""
print " "*10, "y",
for indice_t in range(total_t):
t = lista_t[indice_t]
print " ycalc(t={:2d})".format(t),
print ""
for k in range(len(y)):
print "{:12.6f}".format(y[k]),
for indice_t in range(total_t):
t = lista_t[indice_t]
y_calculado_pcr = tabela_y_calculado_pcr[t]
print " {:12.6f}".format(y_calculado_pcr[k,saida]),
print ""
print ""
print " "*8, "R^2",
for indice_t in range(total_t):
t = lista_t[indice_t]
print " {:12.8f}".format(tabela_beta_R2[(saida,t)]),
print ""
print " "*8, "MSE",
for indice_t in range(total_t):
t = lista_t[indice_t]
print " {:12.8f}".format(tabela_beta_MSE[(saida,t)]),
print ""
print "\n"