# Data manipulation

import pandas as pd
import numpy as np

# Data visualization

import matplotlib.pyplot as plt
import seaborn as sns

# Statistical analysis

from scipy import stats

# Notebook display configuration

# Remove pandas' column cap so wide frames are rendered without truncation.
pd.options.display.max_columns = None

# Absolute local path of the airline delay CSV.
DATA_PATH = r"C:\Users\13015\Desktop\Airline Project\Airline_Delay_Dataset.csv"

# Read the whole file with pandas' default parsing options.
df = pd.read_csv(DATA_PATH)

print("Dataset loaded successfully.")

Dataset loaded successfully.

# Report the (rows, columns) tuple of the loaded frame.
frame_shape = df.shape
print("Dataset Shape:", frame_shape)

Dataset Shape: (92477, 19)

# Preview the first five rows (the default length of head()).
df.head(n=5)

# Print column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92477 entries, 0 to 92476
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   year                 92477 non-null  int64  
 1   month                92477 non-null  int64  
 2   carrier              92477 non-null  object 
 3   airport              92477 non-null  object 
 4   arr_flights          92326 non-null  float64
 5   arr_del15            92139 non-null  float64
 6   carrier_ct           92326 non-null  float64
 7   weather_ct           92326 non-null  float64
 8   nas_ct               92326 non-null  float64
 9   security_ct          92326 non-null  float64
 10  late_aircraft_ct     92326 non-null  float64
 11  arr_cancelled        92326 non-null  float64
 12  arr_diverted         92326 non-null  float64
 13  arr_delay            92326 non-null  float64
 14  carrier_delay        92326 non-null  float64
 15  weather_delay        92326 non-null  float64
 16  nas_delay            92326 non-null  float64
 17  security_delay       92326 non-null  float64
 18  late_aircraft_delay  92326 non-null  float64
dtypes: float64(15), int64(2), object(2)
memory usage: 13.4+ MB

# Summary statistics (count, mean, std, quartiles, extremes) of numeric columns.
df.describe()

# Null count per column, restricted to columns that have gaps, largest first.
null_counts = df.isna().sum()
has_gaps = null_counts > 0
missing_values = null_counts[has_gaps].sort_values(ascending=False)

missing_values

arr_del15              338
arr_flights            151
carrier_ct             151
weather_ct             151
nas_ct                 151
security_ct            151
late_aircraft_ct       151
arr_cancelled          151
arr_diverted           151
arr_delay              151
carrier_delay          151
weather_delay          151
nas_delay              151
security_delay         151
late_aircraft_delay    151
dtype: int64

# Flag every row that repeats an earlier row exactly, then count the flags.
is_repeated_row = df.duplicated()
duplicate_count = is_repeated_row.sum()

print("Number of duplicate rows:", duplicate_count)

Number of duplicate rows: 0

# Impute gaps in every numeric column with that column's own median

numeric_cols = df.select_dtypes(include='number').columns

# One median per numeric column; fillna aligns them by column label.
column_medians = df[numeric_cols].median()
df[numeric_cols] = df[numeric_cols].fillna(column_medians)

# Confirm that no nulls remain in any column.
df.isnull().sum()

year                   0
month                  0
carrier                0
airport                0
arr_flights            0
arr_del15              0
carrier_ct             0
weather_ct             0
nas_ct                 0
security_ct            0
late_aircraft_ct       0
arr_cancelled          0
arr_diverted           0
arr_delay              0
carrier_delay          0
weather_delay          0
nas_delay              0
security_delay         0
late_aircraft_delay    0
dtype: int64

# Drop exact repeats in place, keeping the first occurrence of each row

df.drop_duplicates(keep='first', inplace=True)

# Re-count to confirm that nothing is left.
remaining_duplicates = df.duplicated().sum()
print("Remaining duplicate rows:", remaining_duplicates)

Remaining duplicate rows: 0

# Normalise the two identifier columns: trim surrounding whitespace, upper-case

for text_col in ('carrier', 'airport'):
    df[text_col] = df[text_col].str.strip().str.upper()

# Show the first ten distinct codes of each column as a sanity check.
for text_col in ('carrier', 'airport'):
    print(df[text_col].unique()[:10])

['CE-000' 'CE-001' 'CE-002' 'CE-003' 'CE-004' 'CE-005' 'CE-006' 'CE-007'
 'CE-008' 'CE-009']
['AT-00' 'AT-01' 'AT-02' 'AT-03' 'AT-04' 'AT-05' 'AT-06' 'AT-07' 'AT-08'
 'AT-09']

# Derived rate features, both expressed as a share of arriving flights.
# NOTE(review): the describe() output recorded in this file reports a negative
# minimum for arr_flights, so some rates will be negative; a zero denominator
# would give inf/NaN. Confirm and clean the source values upstream.

arriving_flights = df['arr_flights']

# Feature 1 - Delay Rate: delayed arrivals (arr_del15) per arriving flight
df['delay_rate'] = df['arr_del15'].div(arriving_flights)

# Feature 2 - Cancellation Rate: cancelled arrivals per arriving flight
df['cancel_rate'] = df['arr_cancelled'].div(arriving_flights)

df.loc[:, ['delay_rate', 'cancel_rate']].head()

# Histogram of the delayed-arrival counts (arr_del15), 30 bins.
fig, ax = plt.subplots(figsize=(10, 6))

sns.histplot(data=df, x='arr_del15', bins=30, ax=ax)

ax.set_title('Distribution of Delayed Flights')
ax.set_xlabel('Number of Delayed Flights')
ax.set_ylabel('Frequency')

plt.show()

# Mean delay rate per carrier, highest first.
carrier_delay = (
    df.groupby('carrier')['delay_rate']
    .mean()
    .sort_values(ascending=False)
)

fig, ax = plt.subplots(figsize=(12, 6))

carrier_delay.plot.bar(ax=ax)

ax.set_title('Average Delay Rate by Carrier')
ax.set_xlabel('Carrier')
ax.set_ylabel('Average Delay Rate')

# Tilt the carrier codes so neighbouring labels do not overlap.
plt.xticks(rotation=45)

plt.show()

# Mean delay rate per airport; keep the 15 highest.
mean_rate_by_airport = df.groupby('airport')['delay_rate'].mean()
airport_delay = mean_rate_by_airport.sort_values(ascending=False).iloc[:15]

fig, ax = plt.subplots(figsize=(12, 6))

airport_delay.plot.bar(ax=ax)

ax.set_title('Top 15 Airports by Average Delay Rate')
ax.set_xlabel('Airport')
ax.set_ylabel('Average Delay Rate')

# Tilt the airport codes so neighbouring labels do not overlap.
plt.xticks(rotation=45)

plt.show()

# Mean delay rate for each value of the month column.
monthly_delay = df.groupby('month')['delay_rate'].mean()

fig, ax = plt.subplots(figsize=(10, 6))

# Line chart with a marker on every month.
sns.lineplot(
    x=monthly_delay.index,
    y=monthly_delay.values,
    marker='o',
    ax=ax,
)

ax.set_title('Average Delay Rate by Month')
ax.set_xlabel('Month')
ax.set_ylabel('Average Delay Rate')

plt.show()

# Scatter of arriving-flight volume against delay rate, one point per row.
fig, ax = plt.subplots(figsize=(10, 6))

sns.scatterplot(data=df, x='arr_flights', y='delay_rate', ax=ax)

ax.set_title('Flight Volume vs Delay Rate')
ax.set_xlabel('Arriving Flights')
ax.set_ylabel('Delay Rate')

plt.show()

# Histogram of the cancelled-arrival counts (arr_cancelled), 30 bins.
fig, ax = plt.subplots(figsize=(10, 6))

sns.histplot(data=df, x='arr_cancelled', bins=30, ax=ax)

ax.set_title('Distribution of Cancelled Flights')
ax.set_xlabel('Cancelled Flights')
ax.set_ylabel('Frequency')

plt.show()

# Two-sample comparison of mean delay rate between carriers CE-011 and CE-008

# Delay-rate observations for each of the two carriers.
carrier_A = df.loc[df['carrier'] == 'CE-011', 'delay_rate']

carrier_B = df.loc[df['carrier'] == 'CE-008', 'delay_rate']

# Check sample sizes (row counts; any NaN rates are skipped by the test below)

print("CE-011 Sample Size:", len(carrier_A))
print("CE-008 Sample Size:", len(carrier_B))

# Perform independent t-test; nan_policy='omit' ignores missing rates.
# NOTE(review): SciPy's default assumes equal variances in both groups;
# consider equal_var=False (Welch's test) if that assumption is doubtful.

t_stat, p_value = stats.ttest_ind(
    carrier_A,
    carrier_B,
    nan_policy='omit'
)

# Only the t-test results are reported here: no correlation has been
# computed at this point, so printing one would raise NameError on a clean run.
print("T-Statistic:", t_stat)
print("P-Value:", p_value)

CE-011 Sample Size: 13213
CE-008 Sample Size: 7794
T-Statistic: -0.29026177522588065
P-Value: 0.7716188448923008
Correlation Coefficient: 0.3306

# One-way ANOVA: does the mean delay rate differ between months?

# Create monthly delay rate groups: one sample of non-missing rates per month.

monthly_groups = [
    month_rows['delay_rate'].dropna()
    for _, month_rows in df.groupby('month')
]

# Perform one-way ANOVA across all monthly samples at once.

f_stat, p_value = stats.f_oneway(*monthly_groups)

print("F-Statistic:", f_stat)
print("P-Value:", p_value)

# Scientific notation keeps a very small p-value from being displayed as zero.
print(f"P-Value: {p_value:.3e}")

F-Statistic: 31.77866848118392
P-Value: 4.096003595164069e-68
Correlation Coefficient: 0.3306
P-Value: 0.0000000000

# Pearson correlation between arriving-flight volume and delay rate

# Keep only rows where both values are finite: a NaN or infinite rate
# (e.g. from a zero arr_flights denominator) would otherwise turn the
# whole coefficient into NaN.
volume_vs_delay = (
    df[['arr_flights', 'delay_rate']]
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)

correlation, p_value = stats.pearsonr(
    volume_vs_delay['arr_flights'],
    volume_vs_delay['delay_rate']
)

print("Correlation Coefficient:", correlation)
print("P-Value:", p_value)

print(f"Correlation Coefficient: {correlation:.4f}")

# Scientific notation keeps a very small p-value from being displayed as zero.
print(f"P-Value: {p_value:.3e}")

Correlation Coefficient: 0.02477127084000443
P-Value: 4.920711596730198e-14
Correlation Coefficient: 0.0248
P-Value: 0.0000000000

# Pearson correlation between cancellation rate and delay rate

# Keep only rows where both rates are finite: a NaN or infinite rate
# (e.g. from a zero arr_flights denominator) would otherwise turn the
# whole coefficient into NaN.
cancel_vs_delay = (
    df[['cancel_rate', 'delay_rate']]
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)

correlation, p_value = stats.pearsonr(
    cancel_vs_delay['cancel_rate'],
    cancel_vs_delay['delay_rate']
)

print("Correlation Coefficient:", correlation)
print("P-Value:", p_value)

# Rounded coefficient, matching the report format of the volume/delay test.
print(f"Correlation Coefficient: {correlation:.4f}")

# Scientific notation keeps a very small p-value from being displayed as zero.
print(f"P-Value: {p_value:.3e}")

Correlation Coefficient: 0.33063475412487775
P-Value: 0.0
P-Value: 0.0000000000

	year	month	carrier	airport	arr_flights	arr_del15	carrier_ct	weather_ct	nas_ct	late_aircraft_ct	arr_cancelled	arr_diverted	arr_delay	carrier_delay	weather_delay	nas_delay	late_aircraft_delay
0	2023	8	CE-000	AT-00	89.0	13.0	2.25	1.60	3.16	5.99	2.0	1.0	1375.0	71.0	761.0	118.0	425.0
1	2023	8	CE-000	AT-01	62.0	10.0	1.97	0.04	0.57	7.42	0.0	1.0	799.0	218.0	1.0	62.0	518.0
2	2023	8	CE-000	AT-02	62.0	10.0	2.73	1.18	1.80	4.28	1.0	0.0	766.0	56.0	188.0	78.0	444.0
3	2023	8	CE-000	AT-03	66.0	12.0	3.69	2.27	4.47	1.57	1.0	1.0	1397.0	471.0	320.0	388.0	218.0
4	2023	8	CE-000	AT-04	92.0	22.0	7.76	0.00	2.96	11.28	2.0	0.0	1530.0	628.0	0.0	134.0	768.0

	year	month	arr_flights	arr_del15	carrier_ct	weather_ct	nas_ct	security_ct	late_aircraft_ct	arr_cancelled	arr_diverted	arr_delay	carrier_delay	weather_delay	nas_delay	security_delay	late_aircraft_delay
count	92477.000000	92477.000000	92326.000000	92139.000000	92326.000000	92326.000000	92326.000000	92326.000000	92326.000000	92326.000000	92326.000000	92326.000000	92326.000000	92326.000000	92326.000000	92326.000000	92326.000000
mean	2020.830661	6.227246	286.671393	56.781797	19.492678	2.140265	15.418681	0.175466	19.439746	8.404144	0.749659	3839.853790	1413.232513	225.316845	745.969077	8.286290	1447.043336
std	1.355159	3.397030	910.705200	164.614120	51.277935	7.486700	52.716249	0.825821	64.820558	53.407028	3.413909	12420.268609	4498.278697	877.550153	2980.517129	47.665554	5121.911398
min	2019.000000	1.000000	-21194.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
25%	2020.000000	3.000000	31.000000	5.000000	1.910000	0.000000	0.620000	0.000000	0.940000	0.000000	0.000000	256.000000	84.250000	0.000000	18.000000	0.000000	34.000000
50%	2021.000000	6.000000	82.000000	14.000000	5.310000	0.270000	2.940000	0.000000	3.650000	1.000000	0.000000	837.000000	312.000000	14.000000	108.000000	0.000000	231.000000
75%	2022.000000	9.000000	202.000000	38.000000	15.000000	1.750000	8.970000	0.000000	11.820000	4.000000	0.000000	2462.000000	1005.000000	138.000000	369.000000	0.000000	871.000000
max	2023.000000	12.000000	21873.000000	4142.000000	1293.910000	266.420000	1485.820000	58.690000	2069.070000	4951.000000	154.000000	438783.000000	162563.000000	27876.000000	97283.000000	3760.000000	227959.000000

	delay_rate	cancel_rate
0	0.146067	0.022472
1	0.161290	0.000000
2	0.161290	0.016129
3	0.181818	0.015152
4	0.239130	0.021739

Table of Contents¶

✈️ Airline Delay Analysis¶

1. Introduction¶

📌 Problem Statement¶

🎯 Objective¶

❓ Key Questions¶

📊 Approach¶

💼 Why This Matters¶

2. Data Overview¶

2.1 Import Libraries¶

2.2 Load Dataset¶

2.3 Dataset Shape¶

2.4 Preview of Dataset¶

2.5 Column Information¶

2.6 Statistical Summary¶

2.7 Missing Values Check¶

2.8 Duplicate Values Check¶

2.1 Import Libraries¶

2.2 Load Dataset¶

2.3 Dataset Shape¶

2.4 Preview of Dataset¶

2.5 Column Information¶

2.6 Statistical Summary¶

2.7 Missing Values Check¶

2.8 Duplicate Values Check¶

3. Data Cleaning & Preprocessing¶

3.1 Handle Missing Values¶

3.2 Remove Duplicate Records¶

3.3 Standardize Categorical Data¶

3.4 Feature Engineering¶

3.1 Handle Missing Values¶

3.2 Remove Duplicate Records¶

3.3 Standardize Categorical Data¶

3.4 Feature Engineering¶

4. Exploratory Data Analysis¶

4.1 Distribution of Flight Delays¶

4.2 Airlines with Highest Delay Rates¶

4.3 Airports with Highest Delay Rates¶

4.4 Monthly Delay Trends¶

4.5 Relationship Between Flight Volume and Delays¶

4.6 Cancellation Analysis¶

4.1 Distribution of Flight Delays¶

4.2 Airlines with Highest Delay Rates¶

4.3 Airports with Highest Delay Rates¶

4.4 Monthly Delay Trends¶

4.5 Relationship Between Flight Volume and Delays¶

4.6 Cancellation Analysis¶

5. Hypothesis Testing¶

5.1 Airline Delay Rate Comparison¶

5.2 Monthly Delay Variation¶

5.3 Flight Volume and Delay Relationship¶

5.4 Cancellation Rate and Delay Relationship¶

5.1 Airline Delay Rate Comparison¶

Business Question¶

Null Hypothesis (H₀)¶

Alternative Hypothesis (H₁)¶

An independent t-test is used to compare the average delay rates between two airline carriers.¶

5.2 Monthly Delay Variation¶

Business Question¶

Null Hypothesis (H₀)¶

Alternative Hypothesis (H₁)¶

A one-way ANOVA test is used to determine whether average delay rates differ significantly across multiple months.¶

5.3 Flight Volume and Delay Relationship¶

Business Question¶

Null Hypothesis (H₀)¶

Alternative Hypothesis (H₁)¶

Pearson correlation analysis is used to measure the strength and direction of the relationship between arriving flight volume and delay rates.¶

5.4 Cancellation Rate and Delay Relationship¶

Business Question¶

Null Hypothesis (H₀)¶

Alternative Hypothesis (H₁)¶

Pearson correlation analysis is used to measure the strength and direction of the relationship between cancellation rates and delay rates.¶

6. Key Insights & Recommendations¶

6.1 Key Findings¶

6.2 Business Recommendations¶

6.3 Operational Implications¶

6.1 Key Findings¶

6.2 Business Recommendations¶

6.3 Operational Implications¶

7. Conclusion¶