# Health Insights


# Title: Life Expectancy Data Research
# Author: Justin Rzepko


# Load the complete data set and the variant with missing entries. In the
# latter, empty strings, "." and "*" are read as NA.
data <- read.csv(file = "1a-lifeExpect.csv")
data_missing <- read.csv(
  file = "1a-lifeExpectMissing.csv",
  na.strings = c("", ".", "*")
)

# Coerce the people-per-TV column to numeric.
data_missing$People.TV <- as.numeric(data_missing$People.TV)

# Report the NA count of every column, then show the structure of the
# complete data set.
colSums(is.na(data_missing))
str(data)

# Regress life expectancy on the people-per-TV and people-per-doctor ratios.
# Rows with missing values are handled through na.exclude.
model_life_exp <- lm(
  formula = Life.Expectancy ~ People.TV + People.Dr,
  data = data_missing,
  na.action = na.exclude
)
model_life_exp

# R-squared of the fit
summary(model_life_exp)[["r.squared"]]

# People per doctor against life expectancy, drawn as solid blue points
with(
  data_missing,
  plot(
    People.Dr, Life.Expectancy,
    main = "People.DR vs. Life Expectancy",
    xlab = "People.DR",
    ylab = "Life Expectancy",
    pch = 19, col = "blue"
  )
)

# Overlay the single-predictor least-squares line in red
abline(
  reg = lm(formula = Life.Expectancy ~ People.Dr, data = data_missing),
  col = "red",
  lwd = 2
)


# Model predictions for every row of the data set with missing entries
predicted_values <- predict(object = model_life_exp, newdata = data_missing)

# Observed life expectancy (x) against the model's prediction (y)
plot(
  x = data_missing$Life.Expectancy,
  y = predicted_values,
  main = "Observed vs. Predicted Life Expectancy",
  xlab = "Observed Life Expectancy",
  ylab = "Predicted Life Expectancy",
  pch = 19,
  col = "blue"
)

# Identity line (intercept 0, slope 1): points on it are predicted exactly
abline(a = 0, b = 1, col = "red", lwd = 2)


# Rows where Female.Life.Expectancy is missing
missing_indices_flife_exp <- which(is.na(data_missing$Female.Life.Expectancy))
missing_indices_flife_exp

# Fit a model whose response is Female.Life.Expectancy itself. A model fitted
# to overall Life.Expectancy would fill the gaps with predictions of a
# different quantity. lm() drops the rows with a missing response when fitting.
model_flife_exp <- lm(
  Female.Life.Expectancy ~ People.TV + People.Dr,
  data = data_missing,
  na.action = na.exclude
)

# Predict Female.Life.Expectancy for the rows where it is missing; rows that
# also lack a predictor get an NA prediction.
predicted_flife_exp <- predict(
  model_flife_exp,
  newdata = data_missing[missing_indices_flife_exp, , drop = FALSE]
)
predicted_flife_exp

# Fill the gaps with the predictions, rounded to whole years
data_missing$Female.Life.Expectancy[missing_indices_flife_exp] <-
  round(predicted_flife_exp)

# Treat Country as a categorical variable
data_missing$Country <- as.factor(data_missing$Country)

# Logistic regression of Country on People.TV and People.Dr.
# With a factor response, a binomial glm treats the first level as "failure"
# and every other level as "success", so the fit only identifies a country
# when the factor has exactly two levels.
model_country <- glm(
  Country ~ People.TV + People.Dr,
  data = data_missing,
  family = binomial,
  na.action = na.exclude
)
summary(model_country)

# Rows where Country is missing
missing_indices_country <- which(is.na(data_missing$Country))

# Predicted probability of "not the first level" for those rows
predicted_probs_country <- predict(
  model_country,
  newdata = data_missing[missing_indices_country, , drop = FALSE],
  type = "response"
)

# Impute using the factor's real levels. Assigning a label that is not an
# existing level would only produce NA plus an "invalid factor level" warning.
country_levels <- levels(data_missing$Country)
if (length(country_levels) == 2L) {
  # A probability above 0.5 points to the second level, otherwise the first.
  data_missing$Country[missing_indices_country] <- ifelse(
    predicted_probs_country > 0.5,
    country_levels[2],
    country_levels[1]
  )
} else if (length(missing_indices_country) > 0L) {
  warning(
    "Country has ", length(country_levels), " levels; a binomial model ",
    "cannot pick a label, so missing Country values are left as NA.",
    call. = FALSE
  )
}


# Column-by-column overview of the complete data set
summary(data)

# Centre and spread of life expectancy, ignoring NA entries.
mean(data[["Life.Expectancy"]], na.rm = TRUE)
# On average, individuals in the dataset have a life expectancy of
# approximately 67 years.
median(data[["Life.Expectancy"]], na.rm = TRUE)

sd(data[["Life.Expectancy"]], na.rm = TRUE)
range(data[["Life.Expectancy"]], na.rm = TRUE)


# Count missing values per column before imputing
colSums(is.na(data))

# Make sure both ratio columns are numeric. Column names are case-sensitive:
# the doctor ratio is People.Dr, the name every later step reads.
data$People.TV <- as.numeric(as.character(data$People.TV))
data$People.Dr <- as.numeric(as.character(data$People.Dr))

# Mean-impute: replace each missing entry with the mean of the observed values
# in the same column.
data$Life.Expectancy[is.na(data$Life.Expectancy)] <-
  mean(data$Life.Expectancy, na.rm = TRUE)
data$People.TV[is.na(data$People.TV)] <- mean(data$People.TV, na.rm = TRUE)
data$People.Dr[is.na(data$People.Dr)] <- mean(data$People.Dr, na.rm = TRUE)


# Side-by-side boxplots comparing male and female life expectancy
boxplot(
  list(data$Male.Life.Expectancy, data$Female.Life.Expectancy),
  names = c("Male", "Female"),
  main = "Life Expectancy by Sex",
  ylab = "Life Expectancy (Years)"
)

# Histograms of the people-per-TV and people-per-doctor ratios, side by side.
# par() returns the previous settings; they are restored afterwards so later
# plots are not squeezed into the 1 x 2 grid.
old_par <- par(mfrow = c(1, 2))
hist(data$People.TV, main = "Distribution of People per TV",
     xlab = "People/TV Ratio", breaks = 10)
hist(data$People.Dr, main = "Distribution of People per Doctor",
     xlab = "People/Doctor Ratio", breaks = 10)
par(old_par)

# Research Question 1: Do life expectancies differ by sex?
# Male and female values come from the same rows, so the comparison is a
# paired t-test on the row-wise differences.
t_test_result <- t.test(
  x = data$Male.Life.Expectancy,
  y = data$Female.Life.Expectancy,
  paired = TRUE
)
print(t_test_result)


# Research Question 2: Does TV or Doctor Ratio associate with life expectancies?

# Correlation of each ratio with life expectancy, computed on the rows where
# both values are present
cor_tv <- with(data, cor(People.TV, Life.Expectancy, use = "complete.obs"))
cor_dr <- with(data, cor(People.Dr, Life.Expectancy, use = "complete.obs"))

print(cor_tv)
print(cor_dr)

# Life expectancy against people per TV, with the least-squares line in red
with(
  data,
  plot(
    People.TV, Life.Expectancy,
    main = "Life Expectancy vs. People per TV",
    xlab = "People/TV Ratio",
    ylab = "Life Expectancy",
    pch = 19
  )
)
abline(lm(Life.Expectancy ~ People.TV, data = data), col = "red")

# Life expectancy against people per doctor, with the least-squares line in blue
with(
  data,
  plot(
    People.Dr, Life.Expectancy,
    main = "Life Expectancy vs. People per Doctor",
    xlab = "People/Doctor Ratio",
    ylab = "Life Expectancy",
    pch = 19
  )
)
abline(lm(Life.Expectancy ~ People.Dr, data = data), col = "blue")


# Multiple linear regression of life expectancy on both ratios
model <- lm(Life.Expectancy ~ People.TV + People.Dr, data = data)
summary(model)

# Residuals against fitted values, to check the model assumptions by eye.
# The x-axis is labelled "Fitted Values", so the fitted values must be what
# is plotted there rather than the observation index.
plot(fitted(model), resid(model),
     main = "Residuals Plot", xlab = "Fitted Values", ylab = "Residuals")
# Zero line for reference
abline(h = 0, col = "red")