Cleaning

Set up

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(plotly)

Attaching package: 'plotly'

The following object is masked from 'package:ggplot2':

    last_plot

The following object is masked from 'package:stats':

    filter

The following object is masked from 'package:graphics':

    layout

1. Main Dataset

Data import

df <- read_csv("ev2024.csv") |>
  glimpse()
Rows: 12654 Columns: 8
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (6): region, category, parameter, mode, powertrain, unit
dbl (2): year, value

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Rows: 12,654
Columns: 8
$ region     <chr> "Australia", "Australia", "Australia", "Australia", "Austra…
$ category   <chr> "Historical", "Historical", "Historical", "Historical", "Hi…
$ parameter  <chr> "EV stock share", "EV sales share", "EV sales", "EV stock",…
$ mode       <chr> "Cars", "Cars", "Cars", "Cars", "Cars", "Cars", "Cars", "Ca…
$ powertrain <chr> "EV", "EV", "BEV", "BEV", "BEV", "BEV", "EV", "EV", "PHEV",…
$ year       <dbl> 2011, 2011, 2011, 2011, 2012, 2012, 2012, 2012, 2012, 2012,…
$ unit       <chr> "percent", "percent", "Vehicles", "Vehicles", "Vehicles", "…
$ value      <dbl> 3.9e-04, 6.5e-03, 4.9e+01, 4.9e+01, 2.2e+02, 1.7e+02, 3.0e-…

Checking for missing values

# check missing values
df |>
  summarise_all(~sum(is.na(.)))
# A tibble: 1 × 8
  region category parameter  mode powertrain  year  unit value
   <int>    <int>     <int> <int>      <int> <int> <int> <int>
1      0        0         0     0          0     0     0     0

Data cleaning

Varieble explanation

Region

unique(df$region)
 [1] "Australia"            "Austria"              "Belgium"             
 [4] "Brazil"               "Bulgaria"             "Canada"              
 [7] "Chile"                "China"                "Colombia"            
[10] "Costa Rica"           "Croatia"              "Cyprus"              
[13] "Czech Republic"       "Denmark"              "Estonia"             
[16] "EU27"                 "Europe"               "Finland"             
[19] "France"               "Germany"              "Greece"              
[22] "Hungary"              "Iceland"              "India"               
[25] "Indonesia"            "Ireland"              "Israel"              
[28] "Italy"                "Japan"                "Korea"               
[31] "Latvia"               "Lithuania"            "Luxembourg"          
[34] "Mexico"               "Netherlands"          "New Zealand"         
[37] "Norway"               "Poland"               "Portugal"            
[40] "Rest of the world"    "Romania"              "Seychelles"          
[43] "Slovakia"             "Slovenia"             "South Africa"        
[46] "Spain"                "Sweden"               "Switzerland"         
[49] "Thailand"             "Turkiye"              "United Arab Emirates"
[52] "United Kingdom"       "USA"                  "World"               
unique(df$region)
 [1] "Australia"            "Austria"              "Belgium"             
 [4] "Brazil"               "Bulgaria"             "Canada"              
 [7] "Chile"                "China"                "Colombia"            
[10] "Costa Rica"           "Croatia"              "Cyprus"              
[13] "Czech Republic"       "Denmark"              "Estonia"             
[16] "EU27"                 "Europe"               "Finland"             
[19] "France"               "Germany"              "Greece"              
[22] "Hungary"              "Iceland"              "India"               
[25] "Indonesia"            "Ireland"              "Israel"              
[28] "Italy"                "Japan"                "Korea"               
[31] "Latvia"               "Lithuania"            "Luxembourg"          
[34] "Mexico"               "Netherlands"          "New Zealand"         
[37] "Norway"               "Poland"               "Portugal"            
[40] "Rest of the world"    "Romania"              "Seychelles"          
[43] "Slovakia"             "Slovenia"             "South Africa"        
[46] "Spain"                "Sweden"               "Switzerland"         
[49] "Thailand"             "Turkiye"              "United Arab Emirates"
[52] "United Kingdom"       "USA"                  "World"               

Category

# Categorizes the data as "Historical" or "Projected," indicating whether it reflects past data or future projections.
unique(df$category)
[1] "Historical"       "Projection-STEPS" "Projection-APS"  

Parameter

# Specifies the metric being measured, such as "EV sales," "EV stock," or "charging points," allowing us to focus on specific aspects of the EV market.
unique(df$parameter)
[1] "EV stock share"                "EV sales share"               
[3] "EV sales"                      "EV stock"                     
[5] "EV charging points"            "Electricity demand"           
[7] "Oil displacement Mbd"          "Oil displacement, million lge"

Mode

# Identifies the type of vehicle, encompassing "cars," "buses," "trucks," "vans," "two/three-wheelers," offering insights into the adoption of EVs in different vehicle segments.
unique(df$mode)
[1] "Cars"   "EV"     "Buses"  "Vans"   "Trucks"

Powertain

# Indicates the specific type of drivetrain technology, reflecting the diverse powertrain options in the EV market.
unique(df$powertrain)
[1] "EV"                      "BEV"                    
[3] "PHEV"                    "Publicly available fast"
[5] "Publicly available slow" "FCEV"                   

BEVs: Battery electric vehicles.
PHEVs: Plug-in hybrid electric vehicles.
FCEVs: Fuel cell electric vehicles.
EVs: All electric vehicles (BEVs + PHEVs).

Year

# Provides the year of the data point, enabling tracking of EV trends and growth over time.
unique(df$year)
 [1] 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2010 2025
[16] 2030 2035

Unit

# Specifies the unit of measurement for the corresponding value, indicating the variety of metrics being analyzed.
unique(df$unit)
[1] "percent"                       "Vehicles"                     
[3] "charging points"               "GWh"                          
[5] "Milion barrels per day"        "Oil displacement, million lge"

So the dataset is clean enough with no missing value, we can directly use it to analyse by using “filter” function.

Save the cleaned data

# save the df_cleaned to RData file
save(df, file = "df.RData")

2.Supplement Data

Data Structure Clarification

The dataset provides EV sales share statistics segmented by vehicle mode (Cars, Buses, Trucks, and Vans) rather than reporting aggregate national-level EV sales share. For accurate market analysis, I will directly use the official sector-disaggregated data rather than constructing synthetic totals that might result in accuracy.

ev_share <- read.csv("ev_sales_share.csv")
ev_share_clean <- ev_share |>
  rename("year" = "Year",
         "region" = "Entity",
         "code" = "Code",
         "share" = "Share.of.new.cars.that.are.electric") |> 
  select(year, region, share) |>
  glimpse()
Rows: 468
Columns: 3
$ year   <int> 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 202…
$ region <chr> "Australia", "Australia", "Australia", "Australia", "Australia"…
$ share  <dbl> 0.0065, 0.0300, 0.0340, 0.1600, 0.2000, 0.1500, 0.2600, 0.4200,…
# check missing values
ev_share_clean |>
  summarise_all(~sum(is.na(.)))
  year region share
1    0      0     0
# Save cleaned data "df_clean" to .RData
save(ev_share_clean, file = "cleaned_data.RData")

write.csv(ev_share_clean, "ev_share_clean.csv")

unique(ev_share_clean$region)
 [1] "Australia"           "Austria"             "Belgium"            
 [4] "Brazil"              "Canada"              "Chile"              
 [7] "China"               "Denmark"             "Europe"             
[10] "European Union (27)" "Finland"             "France"             
[13] "Germany"             "Greece"              "Iceland"            
[16] "India"               "Israel"              "Italy"              
[19] "Japan"               "Mexico"              "Netherlands"        
[22] "New Zealand"         "Norway"              "Poland"             
[25] "Portugal"            "Rest of World"       "South Africa"       
[28] "South Korea"         "Spain"               "Sweden"             
[31] "Switzerland"         "Turkey"              "United Kingdom"     
[34] "United States"       "World"              
Back to top