Modelos de estatística básica podem ser rodados de forma muito convencional (por exemplo, como o R faz) usando o pacote Statsmodels conforme já foi mencionado aqui nessa resposta.
Enquanto o foco do capítulo 2 e capítulo 3 são respectivamente regressão linear simples e regressão linear múltipla, o foco desse capítulo é teste de hipóteses. Como você pode ver nos capítulos anteriores, o sumário da regressão já apresenta os testes de hipótese convencionais: testes \(t\) para coeficientes individuais nulos e testes \(F\) para todos os coeficientes nulos. Entretanto, nesse capítulo o Wooldridge especifica alguns testes adicionais como testar se apenas um subconjunto dos coeficientes é nulo, testes unicaudais etc. Então a implementação desse capítulo vai apresentar algumas poucas diferenças.
Exemplo 4.1: [Uma diferença interessante desse exemplo é a implementação de testes monocaudais]
import numpy as np
import statsmodels.api as sm
import matplotlib as plt
import pandas as pd
import patsy as ps
import scipy
# Ex. 4.1
def tTestOneTail(coefficient,referenceCoefficient,results,significanceLevel,oneCaudalTestType):
coefficientValue=results.params[coefficient]
standardErrorEstimator=results.bse[coefficient]
degreesOfFreedom=results.df_resid
significanceLevel=0.05
tstat=(coefficientValue-referenceCoefficient)/standardErrorEstimator
if(oneCaudalTestType=='smaller'):
pvalue=scipy.stats.t.cdf(tstat,degreesOfFreedom)
else:
pvalue=1-scipy.stats.t.cdf(tstat,degreesOfFreedom)
conf1=coefficientValue-scipy.stats.t.ppf(1.0-significanceLevel, degreesOfFreedom)*standardErrorEstimator
conf2=coefficientValue+scipy.stats.t.ppf(1.0-significanceLevel, degreesOfFreedom)*standardErrorEstimator
print "=================================================================="
print "coef"+" "+"std err"+" "+" t "+" "+"pvalue"+" "+"[95% conf. int]"
print "{0:.4f}".format(coefficientValue)," ","{0:.4f}".format(standardErrorEstimator)," ","{0:.4f}".format(tstat)," ","{0:.4f}".format(pvalue)," ","{0:.4f}".format(conf1)," ","{0:.4f}".format(conf2)
if __name__ == '__main__':
df = pd.read_csv('/home/daniel/Documents/Projetos/Prorum/Python For Econometrics/DataSets/Txt/WAGE1.raw',delim_whitespace=True,header=None)
df.columns=['wage','educ','exper','tenure','nonwhite','female','married','numdep','smsa','northcen','south','west','construc','ndurman','trcommpu','trade',\
'services','profserv','profocc','clerocc','servocc','lwage','expersq','tenursq']
# Multiple regression
y,X = ps.dmatrices('np.log(wage) ~ educ + exper + tenure',data=df, return_type='dataframe')
model = sm.OLS(y,X) # Describe Model
results = model.fit() # Fit model
print results.summary()
# This result is already presented above. However, you can test a specific hypothesis using the
# code below
hypotheses = 'exper = 0.0'
t_test = results.t_test(hypotheses)
print(t_test)
# Alternative test
#H0: beta1=0 e H1: beta1>0
coefficient='educ'
referenceCoefficient=0
significantLevel=0.05
oneCaudalTestType='larger'
tTestOneTail(coefficient,referenceCoefficient,results,significanceLevel,oneCaudalTestType)
Exemplo 4.2:
import numpy as np
import statsmodels.api as sm
import matplotlib as plt
import pandas as pd
import patsy as ps
# Ex. 4.2
if __name__ == '__main__':
df = pd.read_csv('/home/daniel/Documents/Projetos/Prorum/Python For Econometrics/DataSets/Txt/MEAP93.raw',delim_whitespace=True,header=None)
df.columns=['lnchprg','enroll','staff','expend','salary','benefits','droprate','gradrate','math10','sci11','totcomp','ltotcomp','lexpend','lenroll','lstaff','bensal','lsalary']
# Multiple regression
y,X = ps.dmatrices('math10 ~ totcomp + staff + enroll',data=df, return_type='dataframe')
model = sm.OLS(y,X) # Describe Model
results = model.fit() # Fit model
print results.summary()
# This result is already presented above. However, you can test a specific hypothesis using the
# code below
hypotheses = 'enroll = 0.0'
t_test = results.t_test(hypotheses)
print(t_test)
Exemplo 4.3:
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import pandas as pd
import patsy as ps
# Examples 4.3
if __name__ == '__main__':
df = pd.read_csv('/home/daniel/Documents/Projetos/Prorum/Python For Econometrics/DataSets/Txt/GPA1.raw',delim_whitespace=True,header=None)
df.columns=['age','soph','junior','senior','senior5','male','campus','business','engineer','colGPA','hsGPA','ACT','job19','job20','drive','bike','walk','voluntr','PC','greek','car','siblings','bgfriend',
'clubs','skipped','alcohol','gradMI','fathcoll','mothcoll']
# Multiple Regression
y,X = ps.dmatrices('colGPA ~ hsGPA + ACT + skipped',data=df, return_type='dataframe')
model = sm.OLS(y,X) # Describe Model
results = model.fit() # Fit model
print results.summary()
# This result is already presented above. However, you can test a specific hypothesis using the
# code below
hypotheses = 'hsGPA = 0.0'
t_test = results.t_test(hypotheses)
print(t_test)
Exemplo 4.4:
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import pandas as pd
import patsy as ps
import scipy
# Example 4.4
def tTestOneTail(coefficient,referenceCoefficient,results,significanceLevel,oneCaudalTestType):
coefficientValue=results.params[coefficient]
standardErrorEstimator=results.bse[coefficient]
degreesOfFreedom=results.df_resid
significanceLevel=0.05
tstat=(coefficientValue-referenceCoefficient)/standardErrorEstimator
if(oneCaudalTestType=='smaller'):
pvalue=scipy.stats.t.cdf(tstat,degreesOfFreedom)
else:
pvalue=1-scipy.stats.t.cdf(tstat,degreesOfFreedom)
conf1=coefficientValue-scipy.stats.t.ppf(1.0-significanceLevel, degreesOfFreedom)*standardErrorEstimator
conf2=coefficientValue+scipy.stats.t.ppf(1.0-significanceLevel, degreesOfFreedom)*standardErrorEstimator
print "degreesOfFreedom",degreesOfFreedom
print "One sided 5% critical value",scipy.stats.t.ppf(1.0-significanceLevel, degreesOfFreedom)
print "=================================================================="
print "coef"+" "+"std err"+" "+" t "+" "+"pvalue"+" "+"[95% conf. int]"
print "{0:.4f}".format(coefficientValue)," ","{0:.4f}".format(standardErrorEstimator)," ","{0:.4f}".format(tstat)," ","{0:.4f}".format(pvalue)," ","{0:.4f}".format(conf1)," ","{0:.4f}".format(conf2)
if __name__ == '__main__':
df = pd.read_csv('/home/daniel/Documents/Projetos/Prorum/Python For Econometrics/DataSets/Txt/campus.raw',delim_whitespace=True,header=None)
df.columns=['enrol','priv','police','crime','lcrime','lenroll','lpolice']
# Multiple Regression
y,X = ps.dmatrices('np.log(crime) ~ np.log(enrol)',data=df, return_type='dataframe')
model = sm.OLS(y,X) # Describe Model
results = model.fit() # Fit model
print results.summary()
# This result is already presented above. However, you can test a specific hypothesis using the
# code below
#H0: beta1=0
hypotheses = 'np.log(enrol) = 0.0'
t_test = results.t_test(hypotheses)
print(t_test)
#H0: beta1=1
hypotheses = 'np.log(enrol) = 1.0'
t_test = results.t_test(hypotheses)
print(t_test)
#H0: beta1=1 e H1: beta1>1
coefficient='np.log(enrol)'
referenceCoefficient=1
significanceLevel=0.05
oneCaudalTestType='larger'
tTestOneTail(coefficient,referenceCoefficient,results,significanceLevel,oneCaudalTestType)
Exemplo 4.5:
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import pandas as pd
import patsy as ps
# Examples 4.5
if __name__ == '__main__':
df = pd.read_csv('/home/daniel/Documents/Projetos/Prorum/Python For Econometrics/DataSets/Txt/HPRICE2.raw',delim_whitespace=True,header=None)
df.columns=['price','crime','nox','rooms','dist','radial','proptax','stratio','lowstat','lprice','lnox','lproptax']
# Multiple Regression
y,X = ps.dmatrices('np.log(price) ~ np.log(nox) + np.log(dist) + rooms + stratio',data=df, return_type='dataframe')
model = sm.OLS(y,X) # Describe Model
results = model.fit() # Fit model
print results.summary()
# This result is already presented above. However, you can test a specific hypothesis using the
# code below
#H0: beta1=0
hypotheses = 'np.log(nox) = -1.0'
t_test = results.t_test(hypotheses)
print(t_test)
Início da Section 4.4: [Teste F]
import numpy as np
import statsmodels.api as sm
import matplotlib as plt
import pandas as pd
import patsy as ps
# Beginning of Section 4.4
if __name__ == '__main__':
df = pd.read_csv('/home/daniel/Documents/Projetos/Prorum/Python For Econometrics/DataSets/Txt/twoyear.raw',delim_whitespace=True,header=None)
df.columns=['female','phsrank','BA','AA','black','hispanic','id','exper','jc','univ','lwage','stotal','smcity','medcity','submed','lgcity',
'sublg','vlgcity','subvlg','ne','nc','south','totcoll']
#ps.NAAction(on_NA='drop', NA_types=['None', 'NaN'])
y,X = ps.dmatrices('lwage ~ jc + univ + exper',data=df, return_type='dataframe')
model = sm.OLS(y,X) # Describe Model
results = model.fit() # Fit model
print results.summary()
# H0: beta1-beta2=0
hypotheses = '(jc = univ)'
f_test = results.f_test(hypotheses)
print f_test
Início da Section 4.5:
import numpy as np
import statsmodels.api as sm
import matplotlib as plt
import pandas as pd
import patsy as ps
# Beginning of Section 4.5
if __name__ == '__main__':
df = pd.read_csv('/home/daniel/Documents/Projetos/Prorum/Python For Econometrics/DataSets/Txt/MLB1.raw',delim_whitespace=True,header=None)
df.columns=['salary','teamsal','nl','years','games','atbats','runs','hits','doubles','triples','hruns','rbis','bavg','bb','so','sbases',
'fldperc','frstbase','scndbase','shrtstop','thrdbase','outfield','catcher',
'yrsallst','hispan','black','whitepop','blackpop','hisppop','pcinc','gamesyr',
'hrunsyr','atbatsyr','allstar','slugavg','rbisyr','sbasesyr','runsyr','percwhte',
'percblck','perchisp','blckpb','hispph','whtepw','blckph','hisppb','lsalary']
#ps.NAAction(on_NA='drop', NA_types=['None', 'NaN'])
y,X = ps.dmatrices('np.log(salary) ~ years + gamesyr + bavg + hrunsyr + rbisyr',data=df, return_type='dataframe')
model = sm.OLS(y,X) # Describe Model
results = model.fit() # Fit model
print results.summary()
# H0: beta1-beta2=0
hypotheses = '(bavg = 0),(hrunsyr=0),(rbisyr)'
f_test = results.f_test(hypotheses)
print f_test
Exemplo 4.6:
import numpy as np
import statsmodels.api as sm
import matplotlib as plt
import pandas as pd
import patsy as ps
# Ex. 4.6
if __name__ == '__main__':
df = pd.read_csv('/home/daniel/Documents/Projetos/Prorum/Python For Econometrics/DataSets/Txt/401K.raw',delim_whitespace=True,header=None)
df.columns=['prate','mrate','totpart','totelg','age','totemp','sole','ltotemp']
y,X = ps.dmatrices('prate ~ mrate + age',data=df, return_type='dataframe')
model = sm.OLS(y,X) # Describe Model
results = model.fit() # Fit model
print results.summary()
Exemplo 4.7:
import numpy as np
import statsmodels.api as sm
import matplotlib as plt
import pandas as pd
import patsy as ps
# Ex. 4.7
if __name__ == '__main__':
df = pd.read_csv('/home/daniel/Documents/Projetos/Prorum/Python For Econometrics/DataSets/Txt/JTRAIN.raw',delim_whitespace=True,header=None,na_values=".")
df.columns=['year','fcode','employ','sales','avgsal','scrap','rework','tothrs',
'union','grant','d89','d88','totrain','hrsemp','lscrap','lemploy',
'lsales','lrework','lhrsemp','lscrap_1','grant_1','clscrap','cgrant',
'clemploy','clsales','lavgsal','clavgsal','cgrant_1','chrsemp','clhrsemp']
ps.NAAction(on_NA='drop', NA_types=['None', 'NaN'])
y,X = ps.dmatrices('np.log(scrap) ~ hrsemp + np.log(sales) + np.log(employ)',data=df, return_type='dataframe')
model = sm.OLS(y,X) # Describe Model
results = model.fit() # Fit model
print results.summary()
Exemplo 4.8:
import numpy as np
import statsmodels.api as sm
import matplotlib as plt
import pandas as pd
import patsy as ps
# Ex. 4.8
if __name__ == '__main__':
df = pd.read_csv('/home/daniel/Documents/Projetos/Prorum/Python For Econometrics/DataSets/Txt/RDCHEM.raw',delim_whitespace=True,header=None)
df.columns=['rd','sales','profits','rdintens','profmarg','salessq','lsales','lrd']
#ps.NAAction(on_NA='drop', NA_types=['None', 'NaN'])
y,X = ps.dmatrices('np.log(rd) ~ np.log(sales) + profmarg',data=df, return_type='dataframe')
model = sm.OLS(y,X) # Describe Model
results = model.fit() # Fit model
print results.summary()
Exemplo 4.9:
import numpy as np
import statsmodels.api as sm
import matplotlib as plt
import pandas as pd
import patsy as ps
# Ex. 4.9
if __name__ == '__main__':
df = pd.read_csv('/home/daniel/Documents/Projetos/Prorum/Python For Econometrics/DataSets/Txt/BWGHT.raw',delim_whitespace=True,header=None,na_values=".")
df.columns=['faminc','cigtax','cigprice','bwght','fatheduc','motheduc','parity','male','white','cigs','lbwght','bwghtlbs','packs','lfaminc']
ps.NAAction(on_NA='drop', NA_types=['None', 'NaN'])
y,X = ps.dmatrices('bwght ~ cigs + parity + faminc + motheduc + fatheduc',data=df, return_type='dataframe')
model = sm.OLS(y,X) # Describe Model
results = model.fit() # Fit model
print results.summary()
# H0: beta4=0 and beta5=0
hypotheses = '(motheduc = 0),(fatheduc=0)'
f_test = results.f_test(hypotheses)
print f_test