Data-Driven Example on using dplyr: Analysis on Airplane Delay
1 Introduction
This is a quick example on how to use dplyer
. Suppose you are given a flight duration data,in the following format
library(readr)
library(tidyverse)
library(ggplot2)
library(dplyr)
flight_data <- read_csv("D:/newsite/My_Website/static/data/flight.data.csv")
head(flight_data)
## # A tibble: 6 x 8
## X1 origin dest carrier sched_dep_time dep_time sched_arr_time arr_time
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1 LGA IAH UA 1859 1900 2159 2204
## 2 2 JFK BOS DL 1455 1452 1613 1553
## 3 3 EWR DEN WN 1220 1228 1450 1426
## 4 4 EWR TPA UA 1422 1417 1715 1714
## 5 5 LGA DTW MQ 1315 1305 1520 1459
## 6 6 JFK FLL DL 2025 2020 2342 2312
Now, you are wondering what is the the on–time departure percentage by carrier, and the flight durations. Notice, you must make a correction when the destination airport is in a time zone that is not thesame as the origin airport.
Therefore, for getting the correct flight duration, we need some extra information, Here, you would be find enough information for the most airport. Let’s download it and see.
airports <-read_csv("D:/newsite/My_Website/static/data/airports.csv")
head(airports)
## # A tibble: 6 x 14
## `1` `Goroka Airport` Goroka `Papua New Guin~ GKA AYGA `-6.08168983459~
## <dbl> <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 2 Madang Airport Madang Papua New Guinea MAG AYMD -5.21
## 2 3 Mount Hagen Kag~ Mount~ Papua New Guinea HGU AYMH -5.83
## 3 4 Nadzab Airport Nadzab Papua New Guinea LAE AYNZ -6.57
## 4 5 Port Moresby Ja~ Port ~ Papua New Guinea POM AYPY -9.44
## 5 6 Wewak Internati~ Wewak Papua New Guinea WWK AYWK -3.58
## 6 7 Narsarsuaq Airp~ Narss~ Greenland UAK BGBW 61.2
## # ... with 7 more variables: `145.391998291` <dbl>, `5282` <dbl>, `10` <dbl>,
## # U <chr>, `Pacific/Port_Moresby` <chr>, airport <chr>, OurAirports <chr>
2 Data Cleaning
Obviously, the data needs to be refined and combined. Here is the process:
colnames(airports) <- c(
"Airport_ID",
"Name",
"City",
"Country",
"IATA",
"ICAO",
"Latitude",
"Longitude",
"Altitude",
"Timezone",
"DST",
"Tz_database_time_zone",
"Type",
"Source"
)
airports$Airport_ID = airports$Airport_ID - 1
flight_new = mutate(
flight_data,
dep_hour = flight_data$dep_time %/% 100,
dep_mins = flight_data$dep_time %% 100,
arrive_hour = flight_data$arr_time %/% 100,
arrive_mins = flight_data$arr_time %% 100
)
final1 = left_join(flight_new, airports, by = c("origin" = "IATA"))
final2 = left_join(final1, airports, by = c("dest" = "IATA"))
## The function will help to calculate the duration time
flight_time_calculator <-
function(dep.hour,
dep.mins,
dep.UTC,
arr.hour,
arr.mins,
arr.UTC) {
input = c(dep.hour, dep.mins, dep.UTC, arr.hour, arr.mins, arr.UTC)
if (any(is.na(input) == TRUE)) {
duration = c(NA, NA)
}
else{
dep_hour_UTC = dep.hour - dep.UTC
arr_hour_UTC = arr.hour - arr.UTC
if (arr_hour_UTC < dep_hour_UTC) {
hour_final = arr_hour_UTC + 24 - dep_hour_UTC
}
else{
hour_final = arr_hour_UTC - dep_hour_UTC
}
mins_final = arr.mins - dep.mins
if (mins_final < 0) {
hour_final = (hour_final * 60 + mins_final) %/% 60
mins_final = (hour_final * 60 + mins_final) %% 60
}
duration = c(hour_final, mins_final)
}
return(duration)
}
final2$duration = rep(0, length(final2$X1))
for (i in 1:length(final2$X1)) {
duration = flight_time_calculator(
final2$dep_hour[i],
final2$dep_mins[i],
final2$Timezone.x[i],
final2$arrive_hour[i],
final2$arrive_mins[i],
final2$Timezone.y[i]
)
final2$duration[i] = duration[1] * 60 + duration[2]
}
## see the duration in the last column
head(final2)
## # A tibble: 6 x 39
## X1 origin dest carrier sched_dep_time dep_time sched_arr_time arr_time
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1 LGA IAH UA 1859 1900 2159 2204
## 2 2 JFK BOS DL 1455 1452 1613 1553
## 3 3 EWR DEN WN 1220 1228 1450 1426
## 4 4 EWR TPA UA 1422 1417 1715 1714
## 5 5 LGA DTW MQ 1315 1305 1520 1459
## 6 6 JFK FLL DL 2025 2020 2342 2312
## # ... with 31 more variables: dep_hour <dbl>, dep_mins <dbl>,
## # arrive_hour <dbl>, arrive_mins <dbl>, Airport_ID.x <dbl>, Name.x <chr>,
## # City.x <chr>, Country.x <chr>, ICAO.x <chr>, Latitude.x <dbl>,
## # Longitude.x <dbl>, Altitude.x <dbl>, Timezone.x <dbl>, DST.x <chr>,
## # Tz_database_time_zone.x <chr>, Type.x <chr>, Source.x <chr>,
## # Airport_ID.y <dbl>, Name.y <chr>, City.y <chr>, Country.y <chr>,
## # ICAO.y <chr>, Latitude.y <dbl>, Longitude.y <dbl>, Altitude.y <dbl>,
## # Timezone.y <dbl>, DST.y <chr>, Tz_database_time_zone.y <chr>, Type.y <chr>,
## # Source.y <chr>, duration <dbl>
3 Results
Hence, we can know the durations of the five longest flights are:
final2_sort = final2[with(final2, order(-final2$duration)),]
print(final2_sort[1:5, c(2:3,39)])
## # A tibble: 5 x 3
## origin dest duration
## <chr> <chr> <dbl>
## 1 JFK HNL 695
## 2 JFK HNL 693
## 3 JFK HNL 673
## 4 JFK HNL 604
## 5 EWR HNL 577
As you can see the flights are all to Hawii. Now let’s find out which flight company has the highest on-time flight rate
z1 = group_by(final2_sort,carrier)
(per_carrier =summarise(z1,on_time_rate=mean(1 * (dep_time - sched_dep_time < 15),na.rm = TRUE)))
## # A tibble: 15 x 2
## carrier on_time_rate
## <chr> <dbl>
## 1 9E 0.754
## 2 AA 0.868
## 3 AS 0.667
## 4 B6 0.771
## 5 DL 0.780
## 6 EV 0.599
## 7 F9 0.5
## 8 FL 0.75
## 9 HA 1
## 10 MQ 0.740
## 11 UA 0.665
## 12 US 0.875
## 13 VX 0.556
## 14 WN 0.778
## 15 YV 1
p = ggplot(per_carrier,
aes(
x = reorder(per_carrier$carrier,-per_carrier$on_time_rate),
y = per_carrier$on_time_rate,
)) + geom_bar(stat="identity")
p +ggtitle(" Flight On-time Status") + labs(y = "On Time Rate",x= "Carrier")