Loading the Data

We used the following libraries: dplyr, readr, ggplot2, corrplot, caret, DescTools, nortest, bestNormalize, and MASS (the loading code is hidden because it produces numerous warnings).

setwd("C:\\Users\\Mateusz\\Desktop\\ML I\\Projekt")
delays_train <- read_csv("delays_train.csv")
delays_test <- read_csv("delays_test.csv")

Since there is not enough time to elaborate on every data transformation technique, we will focus on the most interesting ones and discuss the rest only briefly. First things first, we changed the data types: categorical and logical variables were converted into factors.
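The type-conversion chunk itself is hidden; below is a minimal sketch of what it could look like, under the assumption that every character and every logical column should become a factor (the hidden chunk may instead list the columns explicitly).

# hedged sketch: converting character and logical columns to factors
delays_train <- delays_train %>%
  mutate(across(where(is.character), as.factor),
         across(where(is.logical), as.factor))

delays_test <- delays_test %>%
  mutate(across(where(is.character), as.factor),
         across(where(is.logical), as.factor))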

Missing values

After that, missing values had to be dealt with. Below we show a few of the techniques we used.

1) Converting HHMM format to minutes since midnight
colSums(is.na(delays_train)) %>% sort() %>% as.data.frame ()
##                                .
## Weekday                        0
## Month_of_Year                  0
## Day_of_Month                   0
## Scheduled_Departure_Time       0
## Scheduled_Arrival_Time         0
## Marketing_Airline_DOT_ID       0
## Flight_Number                  0
## Origin_Airport_ID              0
## Flight_Cancelled               0
## Departure_State                0
## Arrival_State                  0
## Arrival_Delay                  0
## Diverted_Airport_Landings      0
## Taxi_Out_Time                  0
## Taxi_In_Time                   0
## Flight_Diverted                0
## Actual_Departure_Time          0
## Origin_Precipitation           0
## Destination_Precipitation      0
## Departure_Delay           141709
## Destination_Airport_ID    142006
## Flight_Distance           142235
## Origin_Temperature        142235
## Destination_Temperature   142235
## Flight_Duration           142276
## Origin_Wind_Speed         142276
## Destination_Wind_Speed    142276
## Marketing_Airline         142322
# delays_train$Departure_Delay is stored in HHMM format

# Converting HHMM format to minutes since midnight
convert_HHMM_to_minutes <- function(time) {
  hours <- time %/% 100  # extracting hours by integer division
  minutes <- time %% 100  # extract minutes by modulus operation
  total_minutes <- hours * 60 + minutes  # Total minutes since midnight
  return(total_minutes)
}

# Calculating the delay only for missing values in Departure_Delay
missing_indices <- is.na(delays_train$Departure_Delay)  # logical mask of rows where Departure_Delay is NA

# Applying the conversion to minutes for Scheduled and Actual Times only for missing Departure_Delay

scheduled_minutes <- sapply(delays_train$Scheduled_Departure_Time[missing_indices], convert_HHMM_to_minutes)

actual_minutes <- sapply(delays_train$Actual_Departure_Time[missing_indices], convert_HHMM_to_minutes)

# Computing the Departure_Delay for missing entries

delays_train$Departure_Delay[missing_indices] <- actual_minutes - scheduled_minutes

# Adjusting for the possible overnight cases

delays_train$Departure_Delay[missing_indices] <- ifelse(
  delays_train$Departure_Delay[missing_indices] < -720,
  # if the flight actually left just after midnight, add one day (1440 min)
  # to calculate the delay correctly
  delays_train$Departure_Delay[missing_indices] + 1440,
  delays_train$Departure_Delay[missing_indices])
2) Imputing missing Marketing_Airline values based on Marketing Airline ID
# Imputing missing Marketing_Airline values based on Marketing_Airline_DOT_ID

length(levels(delays_train$Marketing_Airline))
## [1] 10
length(levels(delays_train$Marketing_Airline_DOT_ID))
## [1] 10
airline_codes <- delays_train %>% filter(!is.na(Marketing_Airline)) %>% 
  dplyr::select(Marketing_Airline,Marketing_Airline_DOT_ID) %>% distinct()

delays_train <- delays_train %>% left_join(airline_codes,by="Marketing_Airline_DOT_ID") %>% 
  mutate(Marketing_Airline=coalesce(Marketing_Airline.x,Marketing_Airline.y)) %>% 
  dplyr::select(-c(Marketing_Airline.x,Marketing_Airline.y))
3) Imputing missing values for destination airport with the most common destination airport for each Airline from each Origin Airport
# Destination_Airport_ID:


# imputing missing values for destination airport with the most common destination airport for
# each airline from each Origin Airport

most_common_dest <- delays_train %>%
  filter(!is.na(Destination_Airport_ID)) %>%
  group_by(Marketing_Airline, Origin_Airport_ID,Destination_Airport_ID) %>%
  summarise(Count=n(),.groups = "drop") %>% 
  group_by(Marketing_Airline, Origin_Airport_ID) %>% 
  summarise(Destination_Airport_ID_most_common = Destination_Airport_ID[which.max(Count)], .groups = 'drop')



delays_train <- delays_train %>% left_join(most_common_dest,by=c("Marketing_Airline","Origin_Airport_ID"),
                                           suffix=c("","most_common")) %>% 
  mutate(Destination_Airport_ID =coalesce(Destination_Airport_ID,Destination_Airport_ID_most_common)) %>% 
  dplyr::select(-Destination_Airport_ID_most_common)
4) Imputing missing values for many variables with the median for groups

Finally, we often filled missing values with the group median; here, for Destination_Wind_Speed, we grouped the data by Destination_Airport_ID. This seemed the most sensible choice, as the wind speed at the destination is tied to the destination airport.

# replacing missing Destination_Wind_Speed with a median value for each Destination_Airport_ID 

# median wind speed for each airport:

median_wind_speeds <- delays_train %>% group_by(Destination_Airport_ID) %>% 
  summarize(median_wind_speed = median(Destination_Wind_Speed,na.rm=TRUE),.groups='drop')
  

delays_train <- delays_train %>% left_join(median_wind_speeds,by='Destination_Airport_ID')

delays_train$Destination_Wind_Speed <- coalesce(delays_train$Destination_Wind_Speed,delays_train$median_wind_speed)
# Making sure there are no more missing values
colSums(is.na(delays_train)) %>% sort() %>% as.data.frame()
##                           .
## Weekday                   0
## Month_of_Year             0
## Day_of_Month              0
## Scheduled_Departure_Time  0
## Scheduled_Arrival_Time    0
## Marketing_Airline_DOT_ID  0
## Flight_Number             0
## Origin_Airport_ID         0
## Destination_Airport_ID    0
## Flight_Cancelled          0
## Departure_State           0
## Arrival_State             0
## Departure_Delay           0
## Arrival_Delay             0
## Diverted_Airport_Landings 0
## Taxi_Out_Time             0
## Taxi_In_Time              0
## Flight_Diverted           0
## Actual_Departure_Time     0
## Flight_Duration           0
## Flight_Distance           0
## Origin_Temperature        0
## Destination_Temperature   0
## Origin_Wind_Speed         0
## Destination_Wind_Speed    0
## Origin_Precipitation      0
## Destination_Precipitation 0
## Marketing_Airline         0
## median_wind_speed         0
## median_origin_wind_speed  0
## median_dest_temp          0
## median_origin_temp        0
## mean_Flight_Distance      0
## mean_Flight_Duration      0
colSums(is.na(delays_test)) %>% sort() %>% as.data.frame()
##                           .
## Weekday                   0
## Month_of_Year             0
## Day_of_Month              0
## Scheduled_Departure_Time  0
## Scheduled_Arrival_Time    0
## Marketing_Airline_DOT_ID  0
## Flight_Number             0
## Origin_Airport_ID         0
## Destination_Airport_ID    0
## Flight_Cancelled          0
## Departure_State           0
## Arrival_State             0
## Departure_Delay           0
## Diverted_Airport_Landings 0
## Taxi_Out_Time             0
## Taxi_In_Time              0
## Flight_Diverted           0
## Actual_Departure_Time     0
## Flight_Duration           0
## Flight_Distance           0
## Origin_Temperature        0
## Destination_Temperature   0
## Origin_Wind_Speed         0
## Destination_Wind_Speed    0
## Origin_Precipitation      0
## Destination_Precipitation 0
## Marketing_Airline         0

As we did not apply any new techniques to the test sample that are worth showing, we now proceed to the next subsection, on feature selection.

Correlation matrix

Correlation between explanatory variables

We can see that the most strongly correlated pairs of explanatory variables are Actual_Departure_Time with Scheduled_Departure_Time, and Flight_Distance with Flight_Duration (around 95%); we should remove one variable from each pair to avoid collinearity.

It is also worth noting that we should not use all explanatory variables, as some of them carry the same information, e.g. Actual_Departure_Time, Scheduled_Departure_Time and Departure_Delay; this will be dealt with later on.
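The chunk that computes the correlation matrix and draws the corrplot figure is hidden; here is a minimal sketch of how the delays_correlations object used below could be obtained, assuming it is restricted to the numeric columns.

# hedged sketch: correlation matrix of the numeric variables
delays_numeric <- delays_train %>% dplyr::select(where(is.numeric))

delays_correlations <- cor(delays_numeric, use = "pairwise.complete.obs")

# visualising the correlations with corrplot (loaded earlier)
corrplot::corrplot(delays_correlations, method = "color", type = "upper",
                   order = "hclust", tl.cex = 0.6)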

Correlation between explanatory variables and the target variable (quantitative variables)
delays_numeric_vars_order <- 
  delays_correlations[,"Arrival_Delay"] %>% 
  sort(decreasing = TRUE) %>%
  names()

delays_numeric_vars_order
##  [1] "Arrival_Delay"            "Departure_Delay"         
##  [3] "Taxi_Out_Time"            "Actual_Departure_Time"   
##  [5] "Taxi_In_Time"             "Scheduled_Departure_Time"
##  [7] "Scheduled_Arrival_Time"   "Origin_Wind_Speed"       
##  [9] "Flight_Duration"          "Destination_Wind_Speed"  
## [11] "Flight_Distance"          "Origin_Temperature"      
## [13] "Destination_Temperature"  "Flight_Number"
# The variables least correlated with the target variable:
#
#  Flight_Distance       Origin_Temperature
#      0.001136347              0.000816758
#
#  Destination_Temperature     Flight_Number
#     -0.001129782             -0.015140559

Based on this, we dropped, among others, Flight_Distance, Origin_Temperature, Destination_Temperature and Flight_Number, as their correlations with the target variable were negligible (close to 0).

ANOVA (qualitative variables)

Since the target variable is quantitative and the explanatory variables considered here are qualitative, we use the analysis of variance (ANOVA).

load("anova.RData")
print(anova_table)
##                                F_stat   p_value
## Marketing_Airline_DOT_ID  737.8673009 0.0000000
## Marketing_Airline         737.8673009 0.0000000
## Weekday                   662.0246018 0.0000000
## Month_of_Year             349.1456926 0.0000000
## Arrival_State              95.9573380 0.0000000
## Departure_State            92.1726709 0.0000000
## Day_of_Month               87.8649295 0.0000000
## Origin_Airport_ID          19.3641269 0.0000000
## Destination_Airport_ID     18.6977158 0.0000000
## Origin_Precipitation        2.1228864 0.0751445
## Destination_Precipitation   0.8846232 0.4720499
## Flight_Cancelled            0.0000000 0.0000000
## Diverted_Airport_Landings   0.0000000 0.0000000
## Flight_Diverted             0.0000000 0.0000000

The ANOVA results indicate that Diverted_Airport_Landings, Flight_Diverted and Flight_Cancelled show negligible variability (their F statistics are zero). These variables therefore carry no predictive value, so we remove them from the dataset. We skip the rest of the ANOVA code; a hedged sketch of how such a table could be produced is shown below. Of course, whenever we remove a variable from the training set, we do the same for the test set.
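A hedged sketch of how an ANOVA table like the one above could be produced (the variable list and the helper code are assumptions; the actual chunk is skipped):

# hedged sketch: one-way ANOVA of Arrival_Delay against each qualitative variable
qualitative_vars <- c("Marketing_Airline_DOT_ID", "Marketing_Airline", "Weekday",
                      "Month_of_Year", "Arrival_State", "Departure_State",
                      "Day_of_Month", "Origin_Airport_ID", "Destination_Airport_ID",
                      "Origin_Precipitation", "Destination_Precipitation",
                      "Flight_Cancelled", "Diverted_Airport_Landings", "Flight_Diverted")

anova_table <- sapply(qualitative_vars, function(v) {
  tryCatch({
    fit <- aov(delays_train$Arrival_Delay ~ as.factor(delays_train[[v]]))
    res <- summary(fit)[[1]]  # ANOVA table of the fitted model
    c(F_stat = res[["F value"]][1], p_value = res[["Pr(>F)"]][1])
  }, error = function(e) c(F_stat = 0, p_value = 0))  # degenerate factors with no variability
}) %>% t() %>% as.data.frame()

anova_table[order(-anova_table$F_stat), ]  # sorted as in the table above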

# removing the helper columns created during imputation
delays_train$median_wind_speed <- NULL
delays_train$median_origin_wind_speed <- NULL
delays_train$median_dest_temp <- NULL
delays_train$median_origin_temp <- NULL
delays_train$mean_Flight_Distance <- NULL
delays_train$mean_Flight_Duration <- NULL
delays_train$median_origin_wind_speed.x <- NULL
delays_train$median_origin_wind_speed.y <- NULL

Feature engineering

Let’s check the distribution of the target variable.
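The plotting chunk is hidden; a minimal sketch of the kind of histogram/density plot examined here (binwidth, colours and axis limits are assumptions, not the original settings):

# hedged sketch: histogram with a density overlay for the target variable
ggplot(delays_train, aes(x = Arrival_Delay)) +
  geom_histogram(aes(y = after_stat(density)), binwidth = 5,
                 fill = "steelblue", colour = "white") +
  geom_density(linewidth = 1) +
  coord_cartesian(xlim = c(-100, 250)) +  # zooming in: delays have a long right tail
  labs(title = "Distribution of Arrival_Delay", x = "Arrival delay (minutes)", y = "Density")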


Now we will perform two normality tests:

# Anderson-Darling normality test 
ad_test <- ad.test(delays_train$Arrival_Delay)
print(ad_test)
## 
##  Anderson-Darling normality test
## 
## data:  delays_train$Arrival_Delay
## A = 177768, p-value < 2.2e-16
# Lilliefors (Kolmogorov-Smirnov) normality test
lillie_test <- lillie.test(delays_train$Arrival_Delay)
print(lillie_test)
## 
##  Lilliefors (Kolmogorov-Smirnov) normality test
## 
## data:  delays_train$Arrival_Delay
## D = 0.23518, p-value < 2.2e-16

Both tests suggest that the target variable does not follow a normal distribution. Let’s also have a look at the Q-Q plot.
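The Q-Q plot chunk is also hidden; a quick base-graphics sketch of how it can be drawn (the original may have used ggplot2 instead):

# hedged sketch: Q-Q plot of the raw target variable against the normal distribution
qqnorm(delays_train$Arrival_Delay, main = "Q-Q plot of Arrival_Delay")
qqline(delays_train$Arrival_Delay, col = "red", lwd = 2)

The same two calls applied to the transformed variable produce the second Q-Q plot discussed after the transformation.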

Since the variable contains many negative values, we cannot log-transform it; however, we can try the Yeo-Johnson transformation.

Arrival_Delay_yeojohnson_values <- yeojohnson(delays_train$Arrival_Delay)
# str(Arrival_Delay_yeojohnson_values)

delays_train$Arrival_Delay_yeojohnson <- Arrival_Delay_yeojohnson_values$x.t

# estimated lambda = 0.7011365 

The transformed variable is now much closer to a normal distribution. Let’s also have a look at the Q-Q plot of the Yeo-Johnson-transformed values.

Finally, we repeat the same normality tests as before:

lillie_test_yeojohnson <- lillie.test(delays_train$Arrival_Delay_yeojohnson)
lillie_test_yeojohnson
## 
##  Lilliefors (Kolmogorov-Smirnov) normality test
## 
## data:  delays_train$Arrival_Delay_yeojohnson
## D = 0.052359, p-value < 2.2e-16
ad_test_yeojohnson <- ad.test(delays_train$Arrival_Delay_yeojohnson)

print(ad_test_yeojohnson)
## 
##  Anderson-Darling normality test
## 
## data:  delays_train$Arrival_Delay_yeojohnson
## A = 7521.8, p-value < 2.2e-16
print(ad_test)  # the original variable, for comparison
## 
##  Anderson-Darling normality test
## 
## data:  delays_train$Arrival_Delay
## A = 177768, p-value < 2.2e-16

The Yeo-Johnson-transformed variable yields much smaller values of the test statistics D and A, which indicates a better fit to the normal distribution than the original Arrival_Delay.

Although the p-values are extremely low in both cases (normality is still formally rejected), the markedly smaller statistics for the transformed variable show that it is substantially closer to a normal distribution than the original data.

Taking all of this into consideration, we decide to use Arrival_Delay_yeojohnson as our target variable.
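As a side note, the fitted bestNormalize object can transform further values and invert the transformation, which is useful if predictions ever need to be reported back in minutes; a hedged sketch (whether the hidden chunks use this is not shown):

# hedged sketch: transforming new values and back-transforming with the fitted object
transformed_again <- predict(Arrival_Delay_yeojohnson_values)  # same values as x.t
back_to_minutes   <- predict(Arrival_Delay_yeojohnson_values,
                             newdata = delays_train$Arrival_Delay_yeojohnson,
                             inverse = TRUE)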

It is also worth mentioning that during estimation some variables caused serious problems due to low variability that had not been detected earlier. We therefore remove them so that estimation can proceed.

# variables that caused serious problems during estimation (low variability)

# training set
delays_train_final_kNN <- delays_train_final %>%
  dplyr::select(-Origin_Airport_ID, -Destination_Airport_ID, -Departure_State, -Arrival_State, -Marketing_Airline)

# test set
delays_test_final_kNN <- delays_test_final %>%
  dplyr::select(-Origin_Airport_ID, -Destination_Airport_ID, -Departure_State, -Arrival_State, -Marketing_Airline)

# drawing 5% random samples to keep training times manageable

set.seed(462476)  
delays_train_final <- delays_train_final[sample(nrow(delays_train_final), ceiling(0.05*length(delays_train_final$Weekday))), ]

delays_test_final <- delays_test_final[sample(nrow(delays_test_final), ceiling(0.05*length(delays_test_final$Weekday))), ]

set.seed(462476)  
delays_train_final_kNN <- delays_train_final_kNN[sample(nrow(delays_train_final_kNN), ceiling(0.05*length(delays_train_final_kNN$Weekday))), ]

set.seed(462476)   
delays_test_final_kNN <- delays_test_final_kNN[sample(nrow(delays_test_final_kNN), ceiling(0.05*length(delays_test_final_kNN$Weekday))), ]

Estimation

Due to the large size of the dataset, we opted to train our models on 5% of the observations (n = 71001), which is still a reasonably large sample.

# setting treatment contrasts so that factor variables are recoded into dummies
options(contrasts = c("contr.treatment",  # for non-ordinal factors
                      "contr.treatment")) # for ordinal factors

ctrl_nocv <- trainControl(method = "none")
ctrl_2cv <- trainControl(method = "cv", number=2)
ctrl_3cv <- trainControl(method = "cv", number=3)
ctrl_5cv <- trainControl(method = "cv", number=5)
ctrl_5x3cv <- trainControl(method = "repeatedcv", number = 5, repeats = 3,
                           summaryFunction = defaultSummary)

regressionMetrics <- function(real, predicted) {
  # Mean Square Error
  MSE <- mean((real - predicted)^2)
  # Root Mean Square Error
  RMSE <- sqrt(MSE)
  # Mean Absolute Error
  MAE <- mean(abs(real - predicted))
  # Mean Absolute Percentage Error
  MAPE <- mean(abs(real - predicted)/abs(real))
  # Median Absolute Error
  MedAE <- median(abs(real - predicted))
  # R2
  R2 <- cor(predicted, real)^2
  
  result <- data.frame(MSE, RMSE, MAE, MAPE, MedAE, R2)
  return(result)
} # but we're only interested in MAPE 

Let’s have a look at the models we managed to obtain. We decided not to include kNN and the linear SVM because they performed far worse than the other methods. We were also unable to fit the polynomial-kernel SVM despite more than 10 hours of training.

setwd("C:\\Users\\Mateusz\\Desktop\\ML I\\Projekt\\Wyniki")
load("reg_model_results.RData")
load("ridge_results.RData")
load("lasso_results.RData")
load("elastic_net_results.RData")
load("delays_svm_radial_results.RData")

Summary table for all models

# Colnames
model_names <- c("Forward", "Backward", "Stepwise", "Ridge", "Lasso", "Elastic_Net", "SVM_Radial")

# Summaries list
summaries <- list(
  forward_model_reg_AIC_summary,
  backward_model_reg_AIC_summary,
  stepwise_model_reg_AIC_summary,
  delays_ridge_summary,
  delays_lasso_summary,
  delays_elastic_net_summary,
  delays_svm_radial_summary
)

# Extracting regression metrics
extract_metrics <- function(summary) {
  MSE <- summary$MSE
  RMSE <- summary$RMSE
  MAE <- summary$MAE
  MAPE <- summary$MAPE
  MedAE <- summary$MedAE
  R2 <- summary$R2
  
  return(data.frame(MSE, RMSE, MAE, MAPE, MedAE, R2))
}

# Creating a results table
results <- data.frame(Model = model_names)

# Loop
for (i in seq_along(summaries)) {
  metrics <- extract_metrics(summaries[[i]])
  results[i, 2:7] <- metrics
}

# Colnames for all metrics
colnames(results) <- c("Model", "MSE", "RMSE", "MAE", "MAPE", "MedAE", "R2")

# Final table
results
##         Model        MSE       RMSE        MAE      MAPE     MedAE        R2
## 1     Forward 0.51756319 0.71941865 0.53905640  8.013296 0.4561047 0.4870323
## 2    Backward 0.51756319 0.71941865 0.53905640  8.013296 0.4561047 0.4870323
## 3    Stepwise 0.51758316 0.71943253 0.53906248 12.632292 0.4561694 0.4870125
## 4       Ridge 0.45718741 0.67615635 0.50640921  9.665456 0.4180403 0.5488564
## 5       Lasso 0.45677731 0.67585302 0.50216075  7.683915 0.4126171 0.5473598
## 6 Elastic_Net 0.45677731 0.67585302 0.50216075  7.683915 0.4126171 0.5473598
## 7  SVM_Radial 0.00952525 0.09759739 0.09622817  3.940444 0.1004303 0.9953002
save(forward_model_reg_AIC_summary, backward_model_reg_AIC_summary,
     stepwise_model_reg_AIC_summary, delays_ridge_summary,
     delays_lasso_summary, delays_elastic_net_summary, 
     delays_svm_radial_summary, file = "Model_summaries.RData")

Final prediction

We can see that nearly all models achieve decent MAPE values (models with MAPE below 10% are generally considered very good). It is also worth looking at R2. For SVM Radial it is close to 1, which strongly suggests overfitting: the model fits the sample almost perfectly, but its out-of-sample forecasts are expected to be poor.

Taking that into consideration, the best model turns out to be the elastic net. It has the lowest MAPE, 7.68% (tied with the lasso), meaning that on average the forecasts deviate from the actual values by 7.68%. Its R2 of 54.74% is the second best, only slightly below the best value (excluding SVM Radial). It is worth noting that the optimal elastic net solution had alpha = 1, which makes it equivalent to lasso regression.

# generating predictions with the chosen model on the test set
final_prediction <- predict(delays_elastic_net, delays_test_final_kNN)

ID <- seq_len(nrow(delays_test_final_kNN))

predictions_df <- data.frame(ID = ID, Prediction = final_prediction)

write.csv(predictions_df, file = "final_prediction_regression.csv", row.names = FALSE)