COVID-19 Exploratory Data Analysis

Introduction

This is a personal R Markdown document I created to visualize COVID-19 updates and to carry out some preliminary exploratory data analysis (EDA). The data come from the GitHub repository created and maintained for the Coronavirus COVID-19 Global Cases project by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University (JHU).

suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(forecast))
suppressPackageStartupMessages(library(zoo))
suppressPackageStartupMessages(library(xts))
suppressPackageStartupMessages(library(gridExtra))
suppressPackageStartupMessages(library(gghighlight))
suppressPackageStartupMessages(library(knitr))
suppressPackageStartupMessages(library(directlabels))
suppressPackageStartupMessages(library(scales))
suppressPackageStartupMessages(library(plotly))
#suppressPackageStartupMessages(library(rjson))

Reading the data

# Download the three global time-series CSV files (confirmed, deaths,
# recovered) from the CSSE GitHub repository into tibbles.
COVID_confirmed_global_raw <- read_csv(
  file = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
)
COVID_deaths_global_raw <- read_csv(
  file = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv"
)
COVID_recovered_global_raw <- read_csv(
  file = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv"
)

Reshaping and formatting data

# Reshape each wide table (one column per day, starting at `1/22/20`) into
# long format with one row per original row and day, then give the four
# leading identifier columns short names by position.
COVID_confirmed_global_longer <- COVID_confirmed_global_raw %>%
  pivot_longer(
    cols      = `1/22/20`:last_col(),
    names_to  = "date",
    values_to = "n_cases"
  ) %>%
  rename(state = 1, country = 2, lat = 3, long = 4)

COVID_deaths_global_longer <- COVID_deaths_global_raw %>%
  pivot_longer(
    cols      = `1/22/20`:last_col(),
    names_to  = "date",
    values_to = "n_cases"
  ) %>%
  rename(state = 1, country = 2, lat = 3, long = 4)

COVID_recovered_global_longer <- COVID_recovered_global_raw %>%
  pivot_longer(
    cols      = `1/22/20`:last_col(),
    names_to  = "date",
    values_to = "n_cases"
  ) %>%
  rename(state = 1, country = 2, lat = 3, long = 4)

# Collapse the per-state rows into one row per country and date by summing
# `n_cases`. `summarize()` keeps only the grouping columns and the new summary,
# so `state`, `lat` and `long` are dropped without a separate `select()`.
# `.groups = "drop_last"` states the resulting grouping explicitly: the output
# stays grouped by `country` (which later per-country steps rely on) and dplyr
# no longer prints its "has grouped output by" message.
COVID_confirmed_global_longer <- COVID_confirmed_global_longer %>%
  group_by(country, date) %>%
  summarize(n_cases = sum(n_cases), .groups = "drop_last")

COVID_deaths_global_longer <- COVID_deaths_global_longer %>%
  group_by(country, date) %>%
  summarize(n_cases = sum(n_cases), .groups = "drop_last")

COVID_recovered_global_longer <- COVID_recovered_global_longer %>%
  group_by(country, date) %>%
  summarize(n_cases = sum(n_cases), .groups = "drop_last")
# Convert the date columns from character to Date.
# The dates come from column headers such as "1/22/20", i.e. month/day and a
# TWO-digit year, so the year must be parsed with `%y`. With `%Y` the "20" is
# read as the literal year 0020, which is why dates printed as "20-01-22".
COVID_confirmed_global_longer$date <- as.Date(COVID_confirmed_global_longer$date, format = "%m/%d/%y")
COVID_deaths_global_longer$date <- as.Date(COVID_deaths_global_longer$date, format = "%m/%d/%y")
COVID_recovered_global_longer$date <- as.Date(COVID_recovered_global_longer$date, format = "%m/%d/%y")

# Add `new_cases`: the change in `n_cases` from the previous date within each
# country. The explicit `group_by(country)` makes the per-country scope of
# `lag()` independent of whatever grouping earlier steps left behind, so the
# first row of one country is never differenced against the last row of the
# previous one. The first date of every country is compared against 0
# (`default = 0`). `dplyr::lag` is namespaced, like `dplyr::last` and
# `dplyr::filter` elsewhere in this file, to avoid masking by other packages.
COVID_confirmed_global_longer <- COVID_confirmed_global_longer %>%
  group_by(country) %>%
  arrange(country, date) %>%
  mutate(new_cases = n_cases - dplyr::lag(n_cases, default = 0))

COVID_deaths_global_longer <- COVID_deaths_global_longer %>%
  group_by(country) %>%
  arrange(country, date) %>%
  mutate(new_cases = n_cases - dplyr::lag(n_cases, default = 0))

COVID_recovered_global_longer <- COVID_recovered_global_longer %>%
  group_by(country) %>%
  arrange(country, date) %>%
  mutate(new_cases = n_cases - dplyr::lag(n_cases, default = 0))

Let’s look at the current data format

# Render the first rows of the confirmed-cases table as a markdown table.
COVID_confirmed_global_longer %>%
  head() %>%
  knitr::kable(format = "markdown")
country date n_cases new_cases
Afghanistan 20-01-22 0 0
Afghanistan 20-01-23 0 0
Afghanistan 20-01-24 0 0
Afghanistan 20-01-25 0 0
Afghanistan 20-01-26 0 0
Afghanistan 20-01-27 0 0

Creating some functions for quick stats

# Print world-wide totals for confirmed cases, deaths and recovered cases.
#
# Reads the global tables `COVID_confirmed_global_longer`,
# `COVID_deaths_global_longer` and `COVID_recovered_global_longer`
# (columns `country`, `n_cases`, `new_cases`). For each table every country
# contributes its maximum `n_cases` and its last `new_cases` value, and these
# are summed over all countries.
#
# Takes no arguments. Called for its printed output; the value of the final
# `print()` (the last message) is returned invisibly.
world_summary <- function() {

  # Reduce one long-format table to a single row of world totals.
  # `dplyr::last()` picks the final row of each country, so this assumes the
  # rows are already ordered by date within country.
  world_totals <- function(long_df) {
    long_df %>%
      group_by(country) %>%
      summarize(n_cases_today = max(n_cases),
                new_cases_today = dplyr::last(new_cases)) %>%
      ungroup() %>%
      summarize(n_cases_total = sum(n_cases_today),
                new_cases_total = sum(new_cases_today))
  }

  confirmed <- world_totals(COVID_confirmed_global_longer)
  deaths    <- world_totals(COVID_deaths_global_longer)
  recovered <- world_totals(COVID_recovered_global_longer)

  print(paste0("number of total confirmed cases in the world as of today:  ", confirmed$n_cases_total, " with ", confirmed$new_cases_total, " new cases"))
  print(paste0("number of total deaths in the world  as of today:  ", deaths$n_cases_total, " with ", deaths$new_cases_total, " new deaths"))
  print(paste0("number of total recovered cases in the world  as of today:  ", recovered$n_cases_total, " with ", recovered$new_cases_total, " new cases"))

}


# Print confirmed, death and recovered figures for one country.
#
# Reads the global tables `COVID_confirmed_global_longer`,
# `COVID_deaths_global_longer` and `COVID_recovered_global_longer`
# (columns `country`, `n_cases`, `new_cases`). For each table it reports the
# country's maximum `n_cases` and its last `new_cases` value.
#
# country1: country name, matched exactly against the `country` column.
#
# Called for its printed output; the value of the final `print()` (the last
# message) is returned invisibly. A name that is absent from the confirmed
# table triggers a warning, because the printed figures would be blank.
country_summary <- function(country1) {

  # An exact-match filter silently yields zero rows for a misspelt or
  # differently-named country; surface that instead of printing blanks.
  if (!all(country1 %in% COVID_confirmed_global_longer$country)) {
    warning("no rows found for country '", country1,
            "'; the printed figures will be empty", call. = FALSE)
  }

  # Reduce one long-format table to the single row for `country1`.
  # Filtering first avoids grouping rows that are discarded anyway.
  # `dplyr::last()` assumes rows are ordered by date within country.
  country_totals <- function(long_df) {
    long_df %>%
      dplyr::filter(country == country1) %>%
      group_by(country) %>%
      summarize(n_cases_today = max(n_cases),
                new_cases_today = dplyr::last(new_cases))
  }

  confirmed <- country_totals(COVID_confirmed_global_longer)
  deaths    <- country_totals(COVID_deaths_global_longer)
  recovered <- country_totals(COVID_recovered_global_longer)

  print(paste0("number of confirmed cases in ", country1, " as of today:  ", confirmed$n_cases_today, " with ", confirmed$new_cases_today, " new cases"))

  print(paste0("number of deaths in ", country1, " as of today:  ", deaths$n_cases_today, " with ", deaths$new_cases_today, " new deaths"))

  print(paste0("number of recovered cases in ", country1, " as of today:  ", recovered$n_cases_today, " with ", recovered$new_cases_today, " new cases"))

}


# Print the world-wide summary, then the summary for a selection of countries.
# The `## [1] ...` lines below each call are the console output recorded when
# this document was rendered; they are not recomputed here.
world_summary()
## [1] "number of total confirmed cases in the world as of today:  227649349 with 593099 new cases"
## [1] "number of total deaths in the world  as of today:  4679139 with 8881 new deaths"
## [1] "number of total recovered cases in the world  as of today:  137249983 with 0 new cases"
country_summary("US")
## [1] "number of confirmed cases in US as of today:  41993789 with 207886 new cases"
## [1] "number of deaths in US as of today:  672635 with 2635 new deaths"
## [1] "number of recovered cases in US as of today:  6298082 with 0 new cases"
country_summary("Italy")
## [1] "number of confirmed cases in Italy as of today:  4627699 with 4544 new cases"
## [1] "number of deaths in Italy as of today:  130233 with 66 new deaths"
## [1] "number of recovered cases in Italy as of today:  4144608 with 0 new cases"
country_summary("Spain")
## [1] "number of confirmed cases in Spain as of today:  4929546 with 3222 new cases"
## [1] "number of deaths in Spain as of today:  85783 with 44 new deaths"
## [1] "number of recovered cases in Spain as of today:  150376 with 0 new cases"
country_summary("China")
## [1] "number of confirmed cases in China as of today:  107838 with 49 new cases"
## [1] "number of deaths in China as of today:  4851 with 0 new deaths"
## [1] "number of recovered cases in China as of today:  99228 with 0 new cases"
country_summary("Egypt")
## [1] "number of confirmed cases in Egypt as of today:  295639 with 588 new cases"
## [1] "number of deaths in Egypt as of today:  16935 with 14 new deaths"
## [1] "number of recovered cases in Egypt as of today:  232179 with 0 new cases"
country_summary("Germany")
## [1] "number of confirmed cases in Germany as of today:  4137062 with 9904 new cases"
## [1] "number of deaths in Germany as of today:  92928 with 22 new deaths"
## [1] "number of recovered cases in Germany as of today:  3659260 with 0 new cases"
country_summary("France")
## [1] "number of confirmed cases in France as of today:  7029959 with 7756 new cases"
## [1] "number of deaths in France as of today:  116618 with 107 new deaths"
## [1] "number of recovered cases in France as of today:  415111 with 0 new cases"
Hossameldin Mohammed
Hossameldin Mohammed
Senior Machine Learning Engineer

My research interests include traffic safety, traffic simulation, transportation demand modeling, generative machine learning, imitation learning, deep learning and experimental design.