Primeira vez aqui? Seja bem vindo e cheque o FAQ!
x

Introductory Econometrics - Jeffrey M. Wooldridge: Como replicar os exemplos do Capítulo 4 usando python?

+2 votos
154 visitas

1 Resposta

0 votos
respondida Fev 18, 2016 por danielcajueiro (5,251 pontos)  

Modelos de estatística básica podem ser rodados de forma muito convencional (por exemplo, como o R faz) usando o pacote Statsmodels conforme já foi mencionado aqui nessa resposta.

Enquanto o foco do capítulo 2 e capítulo 3 são respectivamente regressão linear simples e regressão linear múltipla, o foco desse capítulo é teste de hipóteses. Como você pode ver nos capítulos anteriores, o sumário da regressão já apresenta os testes de hipótese convencionais: testes \(t\) para coeficientes individuais nulos e testes \(F\) para todos os coeficientes nulos. Entretanto, nesse capítulo o Wooldridge especifica alguns testes adicionais como testar se apenas um subconjunto dos coeficientes é nulo, testes unicaudais etc. Então a implementação desse capítulo vai apresentar algumas poucas diferenças.

Exemplo 4.1: [Uma diferença interessante desse exemplo é a implementação de testes monocaudais]

import numpy as np
import statsmodels.api as sm
import matplotlib as plt
import pandas as pd
import patsy as ps
import scipy

# Ex. 4.1

def tTestOneTail(coefficient,referenceCoefficient,results,significanceLevel,oneCaudalTestType):
    coefficientValue=results.params[coefficient]

    standardErrorEstimator=results.bse[coefficient]
    degreesOfFreedom=results.df_resid
    significanceLevel=0.05

    tstat=(coefficientValue-referenceCoefficient)/standardErrorEstimator
    if(oneCaudalTestType=='smaller'): 
        pvalue=scipy.stats.t.cdf(tstat,degreesOfFreedom)
    else:
        pvalue=1-scipy.stats.t.cdf(tstat,degreesOfFreedom)        

    conf1=coefficientValue-scipy.stats.t.ppf(1.0-significanceLevel, degreesOfFreedom)*standardErrorEstimator
    conf2=coefficientValue+scipy.stats.t.ppf(1.0-significanceLevel, degreesOfFreedom)*standardErrorEstimator    


    print "=================================================================="
    print "coef"+"     "+"std err"+"     "+"  t  "+"     "+"pvalue"+"     "+"[95%   conf. int]"
    print "{0:.4f}".format(coefficientValue),"  ","{0:.4f}".format(standardErrorEstimator),"   ","{0:.4f}".format(tstat),"   ","{0:.4f}".format(pvalue),"   ","{0:.4f}".format(conf1)," ","{0:.4f}".format(conf2)


if __name__ == '__main__':

    df = pd.read_csv('/home/daniel/Documents/Projetos/Prorum/Python For Econometrics/DataSets/Txt/WAGE1.raw',delim_whitespace=True,header=None)
    df.columns=['wage','educ','exper','tenure','nonwhite','female','married','numdep','smsa','northcen','south','west','construc','ndurman','trcommpu','trade',\
    'services','profserv','profocc','clerocc','servocc','lwage','expersq','tenursq']

    # Multiple regression
    y,X = ps.dmatrices('np.log(wage) ~ educ + exper + tenure',data=df, return_type='dataframe')
    model = sm.OLS(y,X) # Describe Model
    results = model.fit() # Fit model
    print results.summary()

    # This result is already presented above. However, you can test a specific hypothesis using the
    # code below    
    hypotheses = 'exper = 0.0'
    t_test = results.t_test(hypotheses)
    print(t_test)

    # Alternative test
    #H0: beta1=0 e  H1: beta1>0
    coefficient='educ'
    referenceCoefficient=0
    significantLevel=0.05
    oneCaudalTestType='larger'
    tTestOneTail(coefficient,referenceCoefficient,results,significanceLevel,oneCaudalTestType)

Exemplo 4.2:

import numpy as np
import statsmodels.api as sm
import matplotlib as plt
import pandas as pd
import patsy as ps

# Ex. 4.2

if __name__ == '__main__':

    df = pd.read_csv('/home/daniel/Documents/Projetos/Prorum/Python For Econometrics/DataSets/Txt/MEAP93.raw',delim_whitespace=True,header=None)
    df.columns=['lnchprg','enroll','staff','expend','salary','benefits','droprate','gradrate','math10','sci11','totcomp','ltotcomp','lexpend','lenroll','lstaff','bensal','lsalary']

    # Multiple regression
    y,X = ps.dmatrices('math10 ~ totcomp + staff + enroll',data=df, return_type='dataframe')
    model = sm.OLS(y,X) # Describe Model
    results = model.fit() # Fit model
    print results.summary()

    # This result is already presented above. However, you can test a specific hypothesis using the
    # code below    
    hypotheses = 'enroll = 0.0'
    t_test = results.t_test(hypotheses)
    print(t_test)

Exemplo 4.3:

import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import pandas as pd
import patsy as ps

# Examples 4.3

if __name__ == '__main__':

    df = pd.read_csv('/home/daniel/Documents/Projetos/Prorum/Python For Econometrics/DataSets/Txt/GPA1.raw',delim_whitespace=True,header=None)
    df.columns=['age','soph','junior','senior','senior5','male','campus','business','engineer','colGPA','hsGPA','ACT','job19','job20','drive','bike','walk','voluntr','PC','greek','car','siblings','bgfriend', 
    'clubs','skipped','alcohol','gradMI','fathcoll','mothcoll']


    # Multiple Regression
    y,X = ps.dmatrices('colGPA ~ hsGPA + ACT + skipped',data=df, return_type='dataframe')
    model = sm.OLS(y,X) # Describe Model
    results = model.fit() # Fit model
    print results.summary()

    # This result is already presented above. However, you can test a specific hypothesis using the
    # code below
    hypotheses = 'hsGPA = 0.0'
    t_test = results.t_test(hypotheses)
    print(t_test)    

Exemplo 4.4:

import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import pandas as pd
import patsy as ps
import scipy

# Example 4.4

def tTestOneTail(coefficient,referenceCoefficient,results,significanceLevel,oneCaudalTestType):
    coefficientValue=results.params[coefficient]

    standardErrorEstimator=results.bse[coefficient]
    degreesOfFreedom=results.df_resid
    significanceLevel=0.05

    tstat=(coefficientValue-referenceCoefficient)/standardErrorEstimator
    if(oneCaudalTestType=='smaller'):
        pvalue=scipy.stats.t.cdf(tstat,degreesOfFreedom)
    else:
        pvalue=1-scipy.stats.t.cdf(tstat,degreesOfFreedom)        

    conf1=coefficientValue-scipy.stats.t.ppf(1.0-significanceLevel, degreesOfFreedom)*standardErrorEstimator
    conf2=coefficientValue+scipy.stats.t.ppf(1.0-significanceLevel, degreesOfFreedom)*standardErrorEstimator    
    print "degreesOfFreedom",degreesOfFreedom
    print "One sided 5% critical value",scipy.stats.t.ppf(1.0-significanceLevel, degreesOfFreedom)

    print "=================================================================="
    print "coef"+"     "+"std err"+"     "+"  t  "+"     "+"pvalue"+"     "+"[95%   conf. int]"
    print "{0:.4f}".format(coefficientValue),"  ","{0:.4f}".format(standardErrorEstimator),"   ","{0:.4f}".format(tstat),"   ","{0:.4f}".format(pvalue),"   ","{0:.4f}".format(conf1)," ","{0:.4f}".format(conf2)


if __name__ == '__main__':

    df = pd.read_csv('/home/daniel/Documents/Projetos/Prorum/Python For Econometrics/DataSets/Txt/campus.raw',delim_whitespace=True,header=None)
    df.columns=['enrol','priv','police','crime','lcrime','lenroll','lpolice']


    # Multiple Regression
    y,X = ps.dmatrices('np.log(crime) ~ np.log(enrol)',data=df, return_type='dataframe')
    model = sm.OLS(y,X) # Describe Model
    results = model.fit() # Fit model
    print results.summary()

    # This result is already presented above. However, you can test a specific hypothesis using the
    # code below
    #H0: beta1=0
    hypotheses = 'np.log(enrol) = 0.0'
    t_test = results.t_test(hypotheses)
    print(t_test)    

    #H0: beta1=1
    hypotheses = 'np.log(enrol) = 1.0'
    t_test = results.t_test(hypotheses)
    print(t_test)    

    #H0: beta1=1 e  H1: beta1>1

    coefficient='np.log(enrol)'
    referenceCoefficient=1
    significanceLevel=0.05
    oneCaudalTestType='larger'
    tTestOneTail(coefficient,referenceCoefficient,results,significanceLevel,oneCaudalTestType)

Exemplo 4.5:

import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import pandas as pd
import patsy as ps

# Examples 4.5

if __name__ == '__main__':

    df = pd.read_csv('/home/daniel/Documents/Projetos/Prorum/Python For Econometrics/DataSets/Txt/HPRICE2.raw',delim_whitespace=True,header=None)
    df.columns=['price','crime','nox','rooms','dist','radial','proptax','stratio','lowstat','lprice','lnox','lproptax']

    # Multiple Regression
    y,X = ps.dmatrices('np.log(price) ~ np.log(nox) + np.log(dist) + rooms + stratio',data=df, return_type='dataframe')
    model = sm.OLS(y,X) # Describe Model
    results = model.fit() # Fit model
    print results.summary()

    # This result is already presented above. However, you can test a specific hypothesis using the
    # code below
    #H0: beta1=0
    hypotheses = 'np.log(nox) = -1.0'
    t_test = results.t_test(hypotheses)
    print(t_test)    

Início da Section 4.4: [Teste F]

import numpy as np
import statsmodels.api as sm
import matplotlib as plt
import pandas as pd
import patsy as ps

# Beginning of Section 4.4

if __name__ == '__main__':

    df = pd.read_csv('/home/daniel/Documents/Projetos/Prorum/Python For Econometrics/DataSets/Txt/twoyear.raw',delim_whitespace=True,header=None)
    df.columns=['female','phsrank','BA','AA','black','hispanic','id','exper','jc','univ','lwage','stotal','smcity','medcity','submed','lgcity',
    'sublg','vlgcity','subvlg','ne','nc','south','totcoll']

    #ps.NAAction(on_NA='drop', NA_types=['None', 'NaN'])
    y,X = ps.dmatrices('lwage ~ jc + univ + exper',data=df, return_type='dataframe')
    model = sm.OLS(y,X) # Describe Model
    results = model.fit() # Fit model
    print results.summary()

    # H0: beta1-beta2=0
    hypotheses = '(jc = univ)'
    f_test = results.f_test(hypotheses)    
    print f_test

Início da Section 4.5:

import numpy as np
import statsmodels.api as sm
import matplotlib as plt
import pandas as pd
import patsy as ps

# Beginning of Section 4.5

if __name__ == '__main__':

    df = pd.read_csv('/home/daniel/Documents/Projetos/Prorum/Python For Econometrics/DataSets/Txt/MLB1.raw',delim_whitespace=True,header=None)
    df.columns=['salary','teamsal','nl','years','games','atbats','runs','hits','doubles','triples','hruns','rbis','bavg','bb','so','sbases',
    'fldperc','frstbase','scndbase','shrtstop','thrdbase','outfield','catcher',   
    'yrsallst','hispan','black','whitepop','blackpop','hisppop','pcinc','gamesyr',   
    'hrunsyr','atbatsyr','allstar','slugavg','rbisyr','sbasesyr','runsyr','percwhte',  
    'percblck','perchisp','blckpb','hispph','whtepw','blckph','hisppb','lsalary']

    #ps.NAAction(on_NA='drop', NA_types=['None', 'NaN'])
    y,X = ps.dmatrices('np.log(salary) ~ years + gamesyr + bavg + hrunsyr + rbisyr',data=df, return_type='dataframe')
    model = sm.OLS(y,X) # Describe Model
    results = model.fit() # Fit model
    print results.summary()

    # H0: beta1-beta2=0
    hypotheses = '(bavg = 0),(hrunsyr=0),(rbisyr)'
    f_test = results.f_test(hypotheses)    
    print f_test

Exemplo 4.6:

import numpy as np
import statsmodels.api as sm
import matplotlib as plt
import pandas as pd
import patsy as ps

# Ex. 4.6

if __name__ == '__main__':

    df = pd.read_csv('/home/daniel/Documents/Projetos/Prorum/Python For Econometrics/DataSets/Txt/401K.raw',delim_whitespace=True,header=None)
    df.columns=['prate','mrate','totpart','totelg','age','totemp','sole','ltotemp']

    y,X = ps.dmatrices('prate ~ mrate + age',data=df, return_type='dataframe')
    model = sm.OLS(y,X) # Describe Model
    results = model.fit() # Fit model
    print results.summary()

Exemplo 4.7:

import numpy as np
import statsmodels.api as sm
import matplotlib as plt
import pandas as pd
import patsy as ps

# Ex. 4.7

if __name__ == '__main__':

    df = pd.read_csv('/home/daniel/Documents/Projetos/Prorum/Python For Econometrics/DataSets/Txt/JTRAIN.raw',delim_whitespace=True,header=None,na_values=".")
    df.columns=['year','fcode','employ','sales','avgsal','scrap','rework','tothrs',
    'union','grant','d89','d88','totrain','hrsemp','lscrap','lemploy',  
    'lsales','lrework','lhrsemp','lscrap_1','grant_1','clscrap','cgrant',    
    'clemploy','clsales','lavgsal','clavgsal','cgrant_1','chrsemp','clhrsemp'] 

    ps.NAAction(on_NA='drop', NA_types=['None', 'NaN'])
    y,X = ps.dmatrices('np.log(scrap) ~ hrsemp + np.log(sales) + np.log(employ)',data=df, return_type='dataframe')
    model = sm.OLS(y,X) # Describe Model
    results = model.fit() # Fit model
    print results.summary()

Exemplo 4.8:

import numpy as np
import statsmodels.api as sm
import matplotlib as plt
import pandas as pd
import patsy as ps

# Ex. 4.8

if __name__ == '__main__':

    df = pd.read_csv('/home/daniel/Documents/Projetos/Prorum/Python For Econometrics/DataSets/Txt/RDCHEM.raw',delim_whitespace=True,header=None)
    df.columns=['rd','sales','profits','rdintens','profmarg','salessq','lsales','lrd']  

    #ps.NAAction(on_NA='drop', NA_types=['None', 'NaN'])
    y,X = ps.dmatrices('np.log(rd) ~ np.log(sales) + profmarg',data=df, return_type='dataframe')
    model = sm.OLS(y,X) # Describe Model
    results = model.fit() # Fit model
    print results.summary()

Exemplo 4.9:

import numpy as np
import statsmodels.api as sm
import matplotlib as plt
import pandas as pd
import patsy as ps

# Ex. 4.9

if __name__ == '__main__':

    df = pd.read_csv('/home/daniel/Documents/Projetos/Prorum/Python For Econometrics/DataSets/Txt/BWGHT.raw',delim_whitespace=True,header=None,na_values=".")
    df.columns=['faminc','cigtax','cigprice','bwght','fatheduc','motheduc','parity','male','white','cigs','lbwght','bwghtlbs','packs','lfaminc']


    ps.NAAction(on_NA='drop', NA_types=['None', 'NaN'])
    y,X = ps.dmatrices('bwght ~ cigs + parity + faminc + motheduc + fatheduc',data=df, return_type='dataframe')
    model = sm.OLS(y,X) # Describe Model
    results = model.fit() # Fit model
    print results.summary()

    # H0: beta4=0 and beta5=0
    hypotheses = '(motheduc = 0),(fatheduc=0)'
    f_test = results.f_test(hypotheses)    
    print f_test
...