You are on page 1 of 10

Data Analysis and Statistical Inference

Introduction to Data
# Inspect the dimensions (rows x columns) of the data frame:
dim(present)
# List the variable names of the data frame:
names(present)
# Extract the yearly counts of boys into its own vector:
num_boys <- present[["boys"]]
# Scatterplot of girls born per year:
with(present, plot(year, girls))
# The same relationship drawn as a line plot:
with(present, plot(year, girls, type = "l"))
# Logical vector: TRUE in years where boys outnumber girls
with(present, boys > girls)
# Boy-to-girl ratio for every year:
with(present, plot(year, boys / girls))
# Load the cdc data frame into the workspace (fetched over HTTP):
load(url("http://assets.datacamp.com/course/dasi/cdc.Rdata"))
# Print the head and tail of the data frame:
head(cdc)
tail(cdc)
# You can use the functions mean(), var() and median() to calculate the
# mean, variance and median of variables of your data frame.
# (The two comment lines above were wrapped onto uncommented lines in the
# transcript, which would be a syntax error.)
# The function summary() returns a numerical summary: minimum, first
# quartile, median, mean, third quartile, and maximum.
mean(cdc$weight)
var(cdc$weight)
median(cdc$weight)
summary(cdc$weight)
# Create the frequency table here:
table(cdc$genhlth)
# Create the relative frequency table here. Divide by the sample size;
# nrow(cdc) is 20000 for this data set, but this form generalizes:
table(cdc$genhlth) / nrow(cdc)
# Draw the barplot:
barplot(table(cdc$smoke100))
# Plot the mosaic plot. Assumes 'gender_smokers' was created earlier,
# presumably as table(cdc$gender, cdc$smoke100) -- not shown in this excerpt:
mosaicplot(gender_smokers)

Assign the height of the 1337th respondent to height_1337 using the row-and-column notation.
(Use names() to see what the index of height is.) Assign the weight of the 111th respondent
to weight_111 using the row-and-column notation.
# Extract single cells with [row, column] indexing
# (column 5 is height, column 6 is weight -- see names(cdc)):
height_1337 <- cdc[1337, 5]
weight_111 <- cdc[111, 6]
# Show the extracted values:
height_1337
weight_111
# Rectangular subsets: rows 1-8 of columns 3-5, rows 10-20 of columns 6-9:
first8 <- cdc[1:8, 3:5]
wt_gen_10_20 <- cdc[10:20, 6:9]
# Show the subsets:
first8
wt_gen_10_20
# Box plot of respondent heights:
boxplot(cdc$height)
# Box plot of weight split by the smoke100 indicator:
boxplot(cdc$weight ~ cdc$smoke100)
# Body-mass index from weight (lb) and height (in); 703 converts to metric units:
bmi <- (cdc$weight / cdc$height^2) * 703
# BMI split by general-health category:
boxplot(bmi ~ cdc$genhlth)
# Histogram of BMI with the default bins:
hist(bmi)
# Finer views with 50 and then 100 break points:
hist(bmi, breaks = 50)
hist(bmi, breaks = 100)
Probability
# Show the first nine entries of the 'basket' variable:
kobe$basket[1:9]
# Try a few single coin flips:
outcomes <- c("heads", "tails")
sample(outcomes, size = 1, replace = TRUE)
sample(outcomes, size = 1, replace = TRUE)
sample(outcomes, size = 1, replace = TRUE)
# Simulate 100 flips of a fair coin:
outcomes <- c("heads", "tails")
sim_fair_coin <- sample(outcomes, size = 100, replace = TRUE)
# Show the simulated flips:
sim_fair_coin
# Count heads and tails:
table(sim_fair_coin)
# Simulate 100 flips of a biased coin (20% heads, 80% tails):
outcomes <- c("heads", "tails")
sim_unfair_coin <- sample(outcomes, size = 100, replace = TRUE, prob = c(0.2, 0.8))
# Show the simulated flips:
sim_unfair_coin
# Count heads and tails:
table(sim_unfair_coin)
# Simulate 133 independent shots: "H" with probability 0.45, "M" with 0.55.
outcomes <- c("H", "M")
sim_basket <- sample(outcomes, size = 133, replace = TRUE, prob = c(0.45, 0.55))
sim_basket
table(sim_basket)
# Streak lengths for Kobe's real shots and for the simulated shooter
# (calc_streak is a course-supplied helper -- not shown in this excerpt):
kobe_streak <- calc_streak(kobe$basket)
sim_streak <- calc_streak(sim_basket)
kobe_streak
sim_streak
# Numerical summaries of the two streak distributions:
summary(kobe_streak)
summary(sim_streak)
# Bar plots of the streak-length frequency tables:
kobe_table <- table(kobe_streak)
sim_table <- table(sim_streak)
barplot(kobe_table)
barplot(sim_table)
Foundations for Inference: Sampling Distributions
# Create two independent samples of 50 areas:
samp0 <- sample(area, 50)
samp1 <- sample(area, 50)
# Draw their histograms:
hist(samp0)
hist(samp1)
# --- The for loop ---
# (the heading above was uncommented in the transcript: a syntax error)
# Set up an empty vector of 5000 NAs to store sample means:
sample_means50 <- rep(NA, 5000)
# Take 5000 samples of size 50 of 'area' and store all of them in 'sample_means50'.
for (i in 1:5000) {
  samp <- sample(area, 50)
  sample_means50[i] <- mean(samp)
}
# View the result. If you want, you can increase the bin count to show
# more detail by changing the 'breaks' argument
# (this comment's continuation was uncommented in the transcript):
hist(sample_means50, breaks = 13)

In the case above, we wanted to iterate the two lines of code inside the curly braces that take a
random sample of size 50 from area then save the mean of that sample into
the sample_means50 vector. Without the for loop, this would be painful.
1. In the first line we initialize a vector. In this case, we created a vector of 5000 NAs called
sample_means50. This vector will store values generated within the for
loop. NA means not available, and in this case they're used as placeholders until we fill
in the values with actual sample means. NA is also often used for missing data in R.
2. The second line calls the for loop itself. The syntax can be loosely read as, for every
element i from 1 to 5000, run the following lines of code. You can think of i as the
counter that keeps track of which loop you're on. Therefore, more precisely, the loop will
run once when i=1, then once when i=2, and so on up to i=5000.
3. The body of the for loop is the part inside the curly braces, and this set of code is run for
each value of i. Here, on every loop, we take a random sample of size 50 from area,
take its mean, and store it as the ith element of sample_means50. In order to display
that this is really happening, we asked R to print it at each iteration. This line of code is
optional and is only used for displaying what's going on while the for loop is running.
# Preallocate a length-5000 vector of NA placeholders for the sample means.
sample_means50 <- rep(NA, 5000)
# Repeat the sampling experiment 5000 times.
for (iter in seq_len(5000)) {
  # Draw a random sample of 50 areas...
  drawn <- sample(area, 50)
  # ...and record its mean at position 'iter'.
  sample_means50[iter] <- mean(drawn)
  # Show progress by printing the loop counter.
  print(iter)
}
# Inspect the first few simulated means.
head(sample_means50)
# Preallocate storage for 100 sample means:
sample_means_small <- rep(NA, 100)
# Draw 100 samples of size 50 and record each mean:
for (i in seq_len(100)) {
  samp <- sample(area, 50)
  sample_means_small[i] <- mean(samp)
}
# Show the resulting vector:
sample_means_small
# Preallocate storage for two more sampling distributions:
sample_means10 <- rep(NA, 5000)
sample_means100 <- rep(NA, 5000)
# On each pass draw a size-10 sample, then a size-100 sample
# (the draw order matches the original, preserving the RNG stream):
for (i in seq_len(5000)) {
  samp <- sample(area, 10)
  sample_means10[i] <- mean(samp)
  samp <- sample(area, 100)
  sample_means100[i] <- mean(samp)
}
# Inspect the first few means from each distribution:
head(sample_means10)
head(sample_means50) # was already loaded
head(sample_means100)
# Stack three plotting panels vertically:
par(mfrow = c(3, 1))
# Common x-axis limits so the panels are directly comparable:
xlimits <- range(sample_means10)
# Histograms of the three sampling distributions:
hist(sample_means10, breaks = 20, xlim = xlimits)
hist(sample_means50, breaks = 20, xlim = xlimits)
hist(sample_means100, breaks = 20, xlim = xlimits)
Foundations for Inference: C
# Standard error of the sample mean. Assumes 'samp' (a sample of size 60)
# and 'sample_mean' were created earlier -- not shown in this excerpt:
se <- sd(samp) / sqrt(60)
# Lower and upper bounds of the 95% confidence interval, then print them:
lower <- sample_mean - 1.96 * se
upper <- sample_mean + 1.96 * se
c(lower, upper)
# Initialize 'samp_mean', 'samp_sd' and 'n':
samp_mean <- rep(NA, 50)
samp_sd <- rep(NA, 50)
n <- 60
# Draw 50 samples of size n; record each sample's mean and sd:
for (i in 1:50) {
  samp <- sample(population, n)
  samp_mean[i] <- mean(samp)
  samp_sd[i] <- sd(samp)
}
# 95% interval bounds. Use sqrt(n) rather than the hard-coded sqrt(60)
# so the code stays correct if n is changed above:
lower <- samp_mean - 1.96 * samp_sd / sqrt(n)
upper <- samp_mean + 1.96 * samp_sd / sqrt(n)
# Plot the 50 confidence intervals against the true population mean
# (plot_ci is a course-supplied helper -- not shown in this excerpt):
pop_mean <- mean(population)
plot_ci(lower, upper, pop_mean)
Inference for Numerical Data
# Load the 'inference' function (a DataCamp course helper, fetched over HTTP):
load(url("http://assets.datacamp.com/course/dasi/inference.Rdata"))
# 90% simulation-based CI for the mean of nc$gained, percentile bootstrap:
inference(nc$gained, type="ci", method="simulation", conflevel=0.9, est="mean",
boot_method="perc")
# Same interval at the 95% confidence level:
inference(nc$gained, type="ci", method="simulation", conflevel=0.95, est="mean",
boot_method="perc")

# Same 95% interval using the standard-error bootstrap method instead:


inference(nc$gained, type="ci", method="simulation", conflevel=0.95, est="mean",
boot_method="se")
# Same SE-method interval, but for the median instead of the mean:
inference(nc$gained, type="ci", method="simulation", conflevel=0.95, est="median",
boot_method="se")
# 95% SE-method bootstrap interval for the mean age of fathers (nc$fage):
inference(nc$fage, type="ci", method="simulation", conflevel=0.95, est="mean",
boot_method="se")
The box plots show how the medians of the two distributions compare, but we can also compare
the means of the distributions using the by() function to split the weight variable into
the habit groups, and then take the mean of each using the mean() function. You can use
the by() function as follows to compare the means of the groups: by(numerical_dataset,
categorical_dataset, mean)
Inference for Categorical Data
# Create the 'us12' subset: United States responses from 2012.
us12 <- subset(atheism, atheism$nationality == "United States" & atheism$year =="2012")
# Calculate the proportion of atheist responses:
proportion <- nrow(subset(us12, response == "atheist")) / nrow(us12)
# Print the proportion. Fixed: the transcript printed 'Proportion', but R is
# case-sensitive, so that name is an undefined object:
proportion
# The subset for India for 2012:
india <- subset(atheism, atheism$nationality == "India" & atheism$year =="2012")
# Theoretical CI for the proportion of atheists in India
# ('inference' is the course helper loaded earlier):
inference(india$response, est = "proportion", type = "ci", method = "theoretical",
          success = "atheist")
# The subset for China for 2012:
china <- subset(atheism, atheism$nationality == "China" & atheism$year =="2012")
# Theoretical CI for the proportion of atheists in China:
inference(china$response, est = "proportion", type = "ci", method = "theoretical",
          success = "atheist")
# Make a vector p: a sequence from 0 to 1 in steps of 0.01
# (this comment's continuation was uncommented in the transcript):
n <- 1000
p <- seq(0, 1, 0.01)
# Margin of error for each p, using the approximation ME = 2 * SE
# (this comment's continuation was uncommented in the transcript):
me <- 2 * sqrt(p * (1 - p)/n)
# Plot the margin of error against p to reveal their relationship:
plot(me ~ p)
# Subset of the atheism survey covering Spain (all years):
spain <- subset(atheism, nationality == "Spain")
# Share of "atheist" responses among Spanish respondents:
proportion <- nrow(subset(spain, response == "atheist")) / nrow(spain)
# Theoretical CI for the proportion, split by survey year:
inference(spain$response, spain$year, est = "proportion", type = "ci",
          method = "theoretical", success = "atheist")
# Subset covering the United States (all years):
us <- subset(atheism, nationality == "United States")
# Share of "atheist" responses among US respondents:
proportion <- nrow(subset(us, response == "atheist")) / nrow(us)
# Theoretical CI for the proportion, split by survey year:
inference(us$response, us$year, est = "proportion", type = "ci",
          method = "theoretical", success = "atheist")
Linear Regression
# Correlation between runs scored and at-bats:
correlation <- with(mlb11, cor(runs, at_bats))
# Show the result (auto-print, then an explicit print, as in the original):
correlation
print(correlation)
Just as we used the mean and standard deviation to summarize a single variable, we can
summarize the relationship between these two variables by finding the line that best follows
their association.
Use the following function to select the line that you think does the best job of going through the
cloud of points.
plot_ss(x = mlb11$at_bats, y = mlb11$runs, x1, y1, x2, y2)
This function will first draw a scatterplot of the first two arguments x and y. Then it draws two
points (x1, y1) and (x2, y2) that are shown as red circles. These points are used to draw the line
that represents the regression estimate. The line you specified is shown in black and the
residuals in blue. Note that there are 30 residuals, one for each of the 30 observations. Recall
that the residuals are the difference between the observed values and the values predicted by
the line.
The most common way to do linear regression is to select the line that minimizes the sum of
squared residuals. To visualize the squared residuals, you can rerun the plot_ss() command and
add the argument showSquares = TRUE.
plot_ss(x = mlb11$at_bats, y = mlb11$runs, x1, y1, x2, y2, showSquares = TRUE)
Note that the output from the plot_ss() function provides you with the slope and intercept of your
line as well as the sum of squares.
# This is a random estimate: a line through (5400, 750) and (5700, 650)
# (plot_ss is a course-supplied helper -- not shown in this excerpt):
x1 <- 5400
y1 <- 750
x2 <- 5700
y2 <- 650
plot_ss(x = mlb11$at_bats, y = mlb11$runs, x1, y1, x2, y2, showSquares = TRUE)
# This is another one:
plot_ss(x = mlb11$at_bats, y = mlb11$runs, 5400, 550, 5700, 650, showSquares = TRUE)
# Let plot_ss draw the best fitting (least squares) line instead:
plot_ss(x = mlb11$at_bats, y = mlb11$runs, leastSquares = TRUE, showSquares = TRUE)
# Use 'lm' to fit the linear model of runs on at-bats:
m1 <- lm(runs ~ at_bats, data = mlb11)
# Print the model:
m1
# Fit runs ~ homeruns and print the summary directly:
summary(lm(runs ~ homeruns, data = mlb11))
# Or, less compactly, store the model first, then print and summarize it.
# (In the transcript the "OR, less simply:" remark was uncommented prose
# fused onto the comment -- a syntax error.)
m2 <- lm(runs ~ homeruns, data = mlb11)
print(m2)
summary(m2)
# Create a scatterplot of runs versus at-bats:
plot(mlb11$runs ~ mlb11$at_bats)
# The linear model:
m1 <- lm(runs ~ at_bats, data = mlb11)
# Add the least squares line to the scatterplot:
abline(m1)
Multiple Linear Regression
# Create a scatterplot for 'age' vs 'bty_avg':
plot(evals$age, evals$bty_avg)
# Create a boxplot of 'age' split by 'gender':
boxplot(evals$age ~ evals$gender)
# Create a mosaic plot for 'rank' and 'gender':
mosaicplot(evals$rank ~ evals$gender)
# Create a scatterplot for 'score' vs 'bty_avg':
plot(evals$score ~ evals$bty_avg)
# Apply 'jitter' to one axis at a time to reveal overplotted points:
plot(evals$score ~ jitter(evals$bty_avg))
plot(jitter(evals$score) ~ evals$bty_avg)
# Your initial plot:
plot(evals$score ~ jitter(evals$bty_avg))
# Construct the linear model of score on average beauty:
m_bty <- lm(score ~ bty_avg, data = evals)
# Add the fitted line to the plot:
abline(m_bty)
# Relationship between one individual beauty score and the average.
# (This sentence was uncommented prose in the transcript -- a syntax error.)
plot(evals$bty_avg ~ evals$bty_f1lower)
# The correlation:
cor(evals$bty_avg, evals$bty_f1lower)
# Pairwise scatterplots among all beauty variables (columns 13 to 19):
plot(evals[, 13:19])
# Linear model with average beauty and gender as predictors:
m_bty_gen <- lm(score ~ bty_avg + gender, data = evals)
# Study the outcome:
summary(m_bty_gen)
# Plot one fitted line per gender level with the custom course helper
# multiLines(). (The instruction sentence here was uncommented prose in
# the transcript -- a syntax error.)
multiLines(m_bty_gen)
# Linear model with average beauty and rank:
m_bty_rank <- lm(score ~ bty_avg + rank, data = evals)
# View the regression output:
summary(m_bty_rank)
# Fit the full model: course evaluation score on all candidate predictors.
# (The lm() calls below are wrapped across two lines exactly as in the
# course transcript; the open parenthesis makes the continuation valid R.)
m_full <- lm(score ~ rank + ethnicity + gender + language + age + cls_perc_eval + cls_students
+ cls_level + cls_profs + cls_credits + bty_avg, data = evals)
# View the regression output:
summary(m_full)
# The full model again (refit for the next exercise):
m_full <- lm(score ~ rank + ethnicity + gender + language + age + cls_perc_eval + cls_students
+ cls_level + cls_profs + cls_credits + bty_avg, data = evals)
# New model: the full model with cls_profs dropped:
m_new <- lm(score ~ rank + ethnicity + gender + language + age + cls_perc_eval +
cls_students + cls_level + cls_credits + bty_avg, data = evals)
# View the regression output:
summary(m_new)
# Backward elimination by adjusted R-squared: refit the full model, then
# drop each predictor in turn (m1..m11) and compare each model's adjusted
# R-squared against the full model's; the drop that raises it most is the
# next elimination step.
m_full <- lm(score ~ rank + ethnicity + gender + language + age + cls_perc_eval + cls_students
+ cls_level + cls_profs + cls_credits + bty_avg, data = evals)
summary(m_full)$adj.r.squared
# Remove rank:
m1 <- lm(score ~ ethnicity + gender + language + age + cls_perc_eval + cls_students +
cls_level + cls_profs + cls_credits + bty_avg, data = evals)
summary(m1)$adj.r.squared
# Remove ethnicity:
m2 <- lm(score ~ rank + gender + language + age + cls_perc_eval + cls_students + cls_level +
cls_profs + cls_credits + bty_avg, data = evals)
summary(m2)$adj.r.squared
# Remove gender:
m3 <- lm(score ~ rank + ethnicity + language + age + cls_perc_eval + cls_students + cls_level +
cls_profs + cls_credits + bty_avg, data = evals)
summary(m3)$adj.r.squared
# Remove language:
m4 <- lm(score ~ rank + ethnicity + gender + age + cls_perc_eval + cls_students + cls_level +
cls_profs + cls_credits + bty_avg, data = evals)
summary(m4)$adj.r.squared
# Remove age:
m5 <- lm(score ~ rank + ethnicity + gender + language + cls_perc_eval + cls_students +
cls_level + cls_profs + cls_credits + bty_avg, data = evals)

summary(m5)$adj.r.squared
# Remove cls_perc_eval:
m6 <- lm(score ~ rank + ethnicity + gender + language + age + cls_students + cls_level +
cls_profs + cls_credits + bty_avg, data = evals)
summary(m6)$adj.r.squared
# Remove cls_students:
m7 <- lm(score ~ rank + ethnicity + gender + language + age + cls_perc_eval + cls_level +
cls_profs + cls_credits + bty_avg, data = evals)
summary(m7)$adj.r.squared
# Remove cls_level:
m8 <- lm(score ~ rank + ethnicity + gender + language + age + cls_perc_eval + cls_students +
cls_profs + cls_credits + bty_avg, data = evals)
summary(m8)$adj.r.squared
# Remove cls_profs:
m9 <- lm(score ~ rank + ethnicity + gender + language + age + cls_perc_eval + cls_students +
cls_level + cls_credits + bty_avg, data = evals)
summary(m9)$adj.r.squared
# Remove cls_credits:
m10 <- lm(score ~ rank + ethnicity + gender + language + age + cls_perc_eval + cls_students +
cls_level + cls_profs + bty_avg, data = evals)
summary(m10)$adj.r.squared
# Remove bty_avg:
m11 <- lm(score ~ rank + ethnicity + gender + language + age + cls_perc_eval + cls_students +
cls_level + cls_profs + cls_credits, data = evals)
summary(m11)$adj.r.squared

You might also like