The Python code below explores classification with ordinary least squares (OLS) and the problems caused by outliers. Note in particular that, in the presence of outliers, the discriminant surface starts to err (as the figure below shows): the squared-error criterion also penalizes points lying far on the correct side of the boundary, so a few distant outliers are enough to pull the line toward them. More details on the problem can be found in section 4.1.3 of the book Pattern Recognition and Machine Learning by Christopher Bishop.
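The discriminant line the code draws comes from thresholding the fitted values at 1/2, the standard rule when regressing 0/1 class labels (the script relies on this but never states it, so a quick sketch of the derivation):

\[
\hat\beta_0 + \hat\beta_1 x_1 + \hat\beta_2 x_2 = \tfrac{1}{2}
\quad\Longrightarrow\quad
x_2 = \frac{1/2 - \hat\beta_0 - \hat\beta_1 x_1}{\hat\beta_2}.
\]

Because the regressors are zero-mean and the classes are balanced by construction, $\hat\beta_0 \approx 1/2$, so the line passes roughly through the origin.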

import matplotlib.pyplot as plt
import numpy as np
from sklearn import linear_model

np.random.seed(50)
fig = plt.figure(figsize=(8, 4), dpi=80, facecolor='w', edgecolor='k')
def plot_classificator(beta0, beta1, beta2, x1):
    # Draw the discriminant line beta0 + beta1*x1 + beta2*x2 = 1/2
    # across the observed range of x1.
    minx1 = np.min(x1)
    maxx1 = np.max(x1)
    minx2 = (0.5 - beta0 - beta1 * minx1) / beta2
    maxx2 = (0.5 - beta0 - beta1 * maxx1) / beta2
    plt.plot([minx1, maxx1], [minx2, maxx2], 'k-')
def data_generation1(n, beta0, beta1, beta2):
    x1 = np.random.normal(0, 0.64, n)  # first regressor
    x2 = np.random.normal(0, 0.36, n)  # second regressor
    z = beta0 * np.ones(n) + beta1 * x1 + beta2 * x2  # latent variable
    # Thresholding z at beta0 (i.e. beta1*x1 + beta2*x2 > 0) ensures
    # that, on average, each class has the same number of points.
    y = np.where(z > beta0, 1.0, 0.0)
    return y, x1, x2
def data_generation2(n, beta0, beta1, beta2, x1, x2):
    # Turn the last five observations into outliers lying far to the
    # right of the bulk of the data (assumes n == 100).
    x1[95:100] = [2.29, 2.30, 2.35, 2.38, 2.40]
    x2[95:100] = [-0.40, -0.50, -0.45, -0.55, -0.60]
    z = beta0 * np.ones(n) + beta1 * x1 + beta2 * x2  # latent variable
    # Same labelling rule as in data_generation1.
    y = np.where(z > beta0, 1.0, 0.0)
    return y, x1, x2
def ols_estimator(x, y):
    # fit_intercept=False because x already carries a column of ones.
    linModel = linear_model.LinearRegression(fit_intercept=False)
    linModel.fit(x, y)
    return linModel.coef_
if __name__ == '__main__':
    n = 100  # number of observations
    beta0 = 2
    beta1 = -1.5
    beta2 = -1

    # Case 1: no outliers
    ax = plt.subplot(121)
    y, x1, x2 = data_generation1(n, beta0, beta1, beta2)
    for i in range(n):
        if y[i] == 1:
            plt.plot(x1[i], x2[i], 'ro', markersize=5, markeredgewidth=0)
        else:
            plt.plot(x1[i], x2[i], 'bo', markersize=5, markeredgewidth=0)
    ax.set_xlabel('$x_1$')
    ax.set_ylabel('$x_2$')
    ax.set_title('No outliers')
    x = np.stack((np.ones(n), x1, x2), axis=1)  # design matrix with intercept column
    coefficients = ols_estimator(x, y)
    print(coefficients)
    hatBeta0, hatBeta1, hatBeta2 = coefficients
    plot_classificator(hatBeta0, hatBeta1, hatBeta2, x1)
    # Case 2: the same data with five outliers injected
    ax = plt.subplot(122)
    y, x1, x2 = data_generation2(n, beta0, beta1, beta2, x1, x2)
    for i in range(n):
        if y[i] == 1:
            plt.plot(x1[i], x2[i], 'ro', markersize=5, markeredgewidth=0)
        else:
            plt.plot(x1[i], x2[i], 'bo', markersize=5, markeredgewidth=0)
    ax.set_xlabel('$x_1$')
    ax.set_ylabel('$x_2$')
    ax.set_title('With outliers')
    x = np.stack((np.ones(n), x1, x2), axis=1)
    coefficients = ols_estimator(x, y)
    print(coefficients)
    hatBeta0, hatBeta1, hatBeta2 = coefficients
    plot_classificator(hatBeta0, hatBeta1, hatBeta2, x1)

    fig.savefig("ols.eps")