* Example of fitting models with categorical covariates;
* done in class on November 7, 9 2017;

/*
   Overview of the script
   1. Computer Science data: regress GPA and SATM on a recoded Sex variable.
   2. Salary data: import, build 0-1 dummy variables for Educ, and fit
      models with Educ, Exp and Manage using PROC REG and PROC GLM.
   3. Partial F test for Educ given Exp, built from the ANOVA tables of
      the full and reduced models.
   4. Diagnostic plots for the final model.

   PROC REG, PROC GLM and PROC GPLOT support run-group processing, so each
   of those steps is closed with QUIT as well as RUN.
*/

Libname S3A3 "C:\Users\Angelo\Documents\McMaster Teaching\STAT 3A03\Fall 2017\Data";


* Gender differences in Computer Science Data;

/* Shift Sex down by one. Presumably the raw coding is 1-2 and this makes
   it 0-1 -- confirm against the CSData codebook. */
Data S3A3.CSData1;
    Set S3A3.CSData;
    Sex=Sex-1;
run;

PROC REG Data=S3A3.CSData1 plots=none;
    Model GPA=Sex;
run;
quit;

/* Same comparison for SATM, with studentized residuals plotted against
   the fitted values and against normal quantiles. */
PROC REG Data=S3A3.CSData1 plots=none;
    Model SATM=Sex;
    Plot Student.*Pred.;
    Plot Student.*nqq.;
run;
quit;


* Salary Data;
PROC IMPORT Out=S3A3.Salary
    Datafile="C:\Users\Angelo\Documents\McMaster Teaching\STAT 3A03\Fall 2017\Data\Salary.txt"
    DBMS=DLM REPLACE;
    Getnames=YES;
    Datarow=2;
run;

* Since Educ is categorical we need to define dummy variables;
Data S3A3.Salary1;
    set S3A3.Salary;
    * Construct the dummy variables with 0-1 coding;
    /* NOTE: an observation with Educ outside 1, 2, 3 (including missing)
       gets 0 for all three dummies. */
    if Educ=1 then E1=1; else E1=0;
    if Educ=2 then E2=1; else E2=0;
    if Educ=3 then E3=1; else E3=0;
run;

PROC REG Data=S3A3.Salary1 plots=none;
    * In this model I will use Category 1 as the reference;
    Model Salary=E2 E3;
run;
quit;

PROC REG Data=S3A3.Salary1 plots=none;
    * In this model I will use Category 3 as the reference;
    Model Salary=E1 E2;
run;
quit;

* Using PROC GLM;
PROC GLM Data=S3A3.salary;
    Class Educ;
    Model Salary=Educ /solution;
run;
quit;


* Now looking at Experience as a determinant of salary;
PROC GPLOT Data=S3A3.salary;
    Plot Salary*Exp;
run;
quit;

PROC REG Data=S3A3.salary plots=none;
    Model Salary=Exp;
    Plot Salary*Exp;
run;
quit;

*The scatterplot of salary by experience again;
*This time we use different symbols for education level;
PROC GPLOT Data=S3A3.salary;
    Symbol1 color=red Value=Squarefilled;
    Symbol2 color=green Value=Diamondfilled;
    Symbol3 color=blue Value=trianglefilled;
    Plot Salary*Exp=Educ;
run;
quit;

*Reset the symbol definitions;
Symbol1;
Symbol2;
Symbol3;


* Including both Education level and experience;
PROC REG Data=S3A3.salary1 plots=none;
    Model Salary=E2 E3 Exp;
run;
quit;


* Testing the Effect of Education;

/* Full model: capture the ANOVA table so the error SS and error DF can be
   stored in macro variables. */
PROC REG Data=S3A3.salary1 plots=none;
    Model Salary=E2 E3 Exp;
    ods output anova=anova_full;
run;
quit;

/* CALL SYMPUTX is used instead of CALL SYMPUT because the second argument
   is numeric. SYMPUTX converts it with a wider field and strips blanks,
   so the sums of squares are not truncated to 12 characters and no
   numeric-to-character conversion note is written to the log. */
Data anova_full;
    set anova_full;
    If source='Error' then do;
        call symputx('sse_full', ss);
        call symputx('df_full', df);
    end;
run;

/* Reduced model: drop the education dummies. */
PROC REG Data=S3A3.salary1 plots=none;
    Model Salary=Exp;
    ods output anova=anova_red;
run;
quit;

Data anova_red;
    set anova_red;
    If source='Error' then do;
        call symputx('sse_red', ss);
        call symputx('df_red', df);
    end;
run;

/* Partial F statistic: the drop in error SS per dropped parameter,
   divided by the full-model mean squared error. */
Data educ_Ftest;
    df1=&df_red-&df_full;
    df2=&df_full;
    stat=((&sse_red-&sse_full)/df1)/(&sse_full/df2);
    pvalue=1-CDF('F', stat, df1, df2);
run;

*Using PROC GLM we get a plot with the three fitted lines;
PROC GLM Data=S3A3.salary;
    Class Educ (Ref='1');
    Model Salary=Educ Exp /solution;
run;
quit;


*Next we look at the model with 2 categorical covariates;
*Manage is already coded as 0-1 so we can use it directly;
PROC REG Data=S3A3.salary1 plots=none;
    Model Salary=E2 E3 Manage;
run;
quit;

PROC GLM Data=S3A3.salary;
    Class Educ (Ref='1') Manage (Ref='0');
    Model Salary=Educ Manage /solution;
run;
quit;


* Finally fit a model with both categorical covariates and experience;
* We get 6 parallel lines with additive structure for the intercepts;
PROC GLM Data=S3A3.salary;
    Class Educ (Ref='1') Manage (Ref='0');
    Model Salary=Educ Manage Exp/solution;
    Output Out=salary_out
        Predicted=Fitted
        Residual=Resid_raw
        Student=Resid_stud
        CookD=CookD
        H=Leverage;
run;
quit;

/* Add the observation number so leverage and Cook's D can be plotted
   against it. */
Data salary_out;
    set salary_out;
    index=_N_;
run;

* Diagnostic plots;
/* The studentized residual variable is Resid_stud, as named in the OUTPUT
   statement above. The earlier spelling with a period in place of the
   underscore did not match any variable in salary_out. */
PROC GPLOT Data=salary_out;
    Goptions Reset=All;
    Plot Resid_stud*Fitted /VREF=0;
    Plot Fitted*Salary;
    Plot Leverage*index;
    Plot CookD*index;
run;
quit;

* The normal qq-plot needs to be done separately using PROC Univariate;
PROC Univariate data=salary_out noprint;
    qqplot Resid_stud;
run;