# Reading data (Creating a dataframe)

• mydata=read.table(file=file.choose()) #use to read in the txt files for the textbook exercises
• newdata <- na.omit(mydata)

# Commands for dataframes

• mydata #shows the entire data set
• head(mydata) #shows the first 6 rows
• tail(mydata) #shows the last 6 rows
• str(mydata) #shows the variable names and types
• names(mydata) #shows the variable names
• rename(V1,Variable1, dataFrame=mydata) #renames V1 to Variable1; Note that epicalc package must be installed!
• ls() #shows a list of objects that are available

# Descriptive Statistics

• mean(x) #computes the mean of the variable x
• median(x) #computes the median of the variable x
• sd(x) #computes the standard deviation of the variable x
• IQR(x) #computes the IQR of the variable x
• summary(x) #computes the 5-number summary and the mean of the variable x
• cor(x,y) #computes the correlation coefficient
• cor(mydata) #computes a correlation matrix

# Graphical Displays

• hist(x) #creates a histogram for the variable x
• boxplot(x) # creates a boxplot for the variable x
• boxplot(y~x) # creates side-by-side boxplots
• stem(x) #creates a stem plot for the variable x
• plot(y~x) #creates a scatterplot of y versus x
• plot(mydata) #provides a scatterplot matrix
• abline(lm(y~x)) #adds regression line to plot

# Probability Distributions

• dbinom(x, n, p) #binomial density, probability
• pbinom(x, n, p) #cumulative binomial probability
• qbinom(q, n, p) # qth percentile for a binomial distribution
• pnorm(x, mean, sd) #cumulative probability for a Normal distribution
• qnorm(q, mean, sd) # find qth percentile for a Normal distribution
• dgeom(y, p) #provides the probability of y failures before the first success for a geometric(p) R.V.
• dnbinom(y, r, p) #provides the probability of y failures before the rth success, each with probability p
• dpois(x, lambda) #provides the probability of observing exactly x events, when the mean is lambda

# Statistical Inference

• z.test(x) #NEED BSDA package; get a one sample z test
• t.test(x) #get a one sample t test
• t.test(x,y) #get a two sample t test
• t.test(x, y, paired=TRUE) #get a paired t test
• cor.test(x,y) #test plus CI for rho

# Analysis of Variance

• t.test(y~x, var.equal=TRUE) #pooled t-test where x is a factor
• x=as.factor(x) #coerce x to be a factor variable
• tapply(y, x, mean) #get mean of y at each level of x
• tapply(y, x, sd) #get standard deviations of y at each level of x
• tapply(y, x, length) #get sample sizes of y at each level of x
• plotmeans(y~x) #means and 95% confidence intervals
• AOVmodel=aov(y~x) #one-way ANOVA
• summary(AOVmodel) #get ANOVA output
• oneway.test(y~x, var.equal=TRUE) #one-way test output
• levene.test(y,x) #Levene's test for equal variances
• blockmodel=aov(y~x+block) #Randomized block design model with "block" as a variable
• tapply(y, list(x1, x2), mean) #get the mean of y for each cell of x1 by x2
• anova(lm(y~x1+x2)) #a way to get a two-way ANOVA table
• interaction.plot(FactorA, FactorB, y) #get an interaction plot
• TukeyHSD(AOVmodel) #get Tukey CIs and P-values
• plot(TukeyHSD(AOVmodel)) #get 95% family-wise CIs
• contrast=rbind(c(.5,.5,-1/3,-1/3,-1/3)) #set up a contrast
• summary(glht(AOVmodel, linfct=mcp(x=contrast))) #test a contrast
• confint(glht(AOVmodel, linfct=mcp(x=contrast))) #CI for a contrast

# Linear Regression Models

• regmodel=lm(y~x) #fit a regression model
• summary(regmodel) #get results from fitting the regression model
• anova(regmodel) #get the ANOVA table for the regression fit
• plot(regmodel) #get four plots, including normal probability plot, of residuals
• fits=regmodel$fitted #store the fitted values in variable named "fits"
• resids=regmodel$residuals #store the residual values in a variable named "resids"
• sresids=rstandard(regmodel) #store the standardized residuals in a variable named "sresids"
• studresids=rstudent(regmodel) #store the studentized residuals in a variable named "studresids"
• beta1hat=regmodel$coeff[2] #assign the slope coefficient to the name "beta1hat"
• qt(.975,15) # find the 97.5% percentile for a t distribution with 15 df
• confint(regmodel) #CIs for all parameters
• newx=data.frame(X=41) #create a new data frame with one new x* value of 41
• predict.lm(regmodel,newx,interval="confidence") #get a CI for the mean at the value x*
• predict.lm(regmodel,newx,interval="prediction") #get a prediction interval for an individual Y value at the value x*
• hatvalues(regmodel) #get the leverage values (hi)
• Diagnostics
• sresids=rstandard(regmodel) #store the standardized residuals in a variable named "sresids"
• standresid=stdres(regmodel) #store the standardized residuals in a variable named "standresids"
• stud.del.resids=rstudent(regmodel) #store the studentized deleted residuals in a variable named "stud.del.resids"
• hatvalues(regmodel) #get the leverage values (hi)
• cooks.distance(regmodel) #get Cook's distance
• dfbetas(regmodel) #print all dfbetas
• dfbetas(regmodel)[4,1] #dfbeta for case 4, first coefficient (i.e., b_0)
• dffits(regmodel) [4] #dffits for case 4
• influence(regmodel) #various influence statistics, including hat values and dfbeta (not dfbetas) values
• library(car) #load the package car
• vif(regmodel) #variance inflation factors
• Tests for homogeneity of variance
• bptest(regmodel) #get the Breusch-Pagan test (lmtest package must be installed)
• levene.test(Y, groupvariable) #get the Levene test (lawstat package must be installed)
• Tests for normality
• ad.test(resids) #get Anderson-Darling test for normality (nortest package must be installed)
• cvm.test(resids) #get Cramer-von Mises test for normality (nortest package must be installed)
• lillie.test(resids) #get Lilliefors (Kolmogorov-Smirnov) test for normality (nortest package must be installed)
• pearson.test(resids) #get Pearson chi-square test for normality (nortest package must be installed)
• sf.test(resids) #get Shapiro-Francia test for normality (nortest package must be installed)
• Lack-of-fit test
• Reduced=lm(y~x)#fit reduced model
• Full=lm(y~0+as.factor(x)) #fit full model
• anova(Reduced, Full) #get lack-of-fit test
• boxcox(regmodel) #evaluate possible Box-Cox transformations (MASS package must be installed)
• Model Selection
• library(leaps) #load the leaps package
• allmods = regsubsets(y~x1+x2+x3+x4, nbest=2, data=mydata) #(leaps package must be loaded), identify best two models for 1, 2, 3 predictors
• summary(allmods) # get summary of best subsets
• summary(allmods)$cp #Cp for some models
• plot(allmods, scale="adjr2") # plot that identifies models
• plot(allmods, scale="Cp") # plot that identifies models
• fullmodel=lm(y~., data=mydata) # regress y on everything in mydata
• MSE=(summary(fullmodel)$sigma)^2 # store MSE for the full model
• extractAIC(lm(y~x1+x2+x3), scale=MSE) #get Cp (equivalent to AIC)
• step(fullmodel, scale=MSE, direction="backward") #backward elimination
• step(fullmodel, scale=MSE, direction="forward") #forward selection
• step(fullmodel, scale=MSE, direction="both") #stepwise regression
• none=lm(y~1) #regress y on the constant only
• step(none, scope=list(upper=fullmodel), scale=MSE) #use Cp in stepwise regression

# Nonparametric Tests

• binom.test(x, n, p) #Binomial test for proportions
• binom.confint(x, n) #NEED binom package, confidence intervals for a proportion
• SIGN.test(x, md=0) #NEED BSDA package, Sign test
• wilcox.test(x) #Wilcoxon signed rank test
• owa(pre, post) #NEED NSM3 package, Ordered Walsh Averages
• wilcox.test(x, y) #Wilcoxon rank sum test
• wilcox_test(x, y) # NEED coin package - Exact conditional distribution
• kruskal.test(x, g) #Kruskal-Wallis rank sum test; g is a grouping variable
• friedman.test(y~A|B) # y are the data values, A is a grouping factor, and B is a blocking factor
• cor.test(x,y) #correlation test plus CI for several measures of association (r, rho, tau)
• kendall.ci(x, y) #NEED NSM3 package; provides a confidence interval for Kendall's tau