COVID-19 Exploratory Data Analysis

Last updated on Sep 18, 2021 5 min read

Introduction

This is a personal R Markdown document that I created to visualize COVID-19 updates and to perform some preliminary exploratory data analysis (EDA). The data come from the GitHub repository of COVID-19 global cases created and maintained by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University (JHU).

suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(forecast))
suppressPackageStartupMessages(library(zoo))
suppressPackageStartupMessages(library(xts))
suppressPackageStartupMessages(library(gridExtra))
suppressPackageStartupMessages(library(gghighlight))
suppressPackageStartupMessages(library(knitr))
suppressPackageStartupMessages(library(directlabels))
suppressPackageStartupMessages(library(scales))
suppressPackageStartupMessages(library(plotly))
#suppressPackageStartupMessages(library(rjson))

Reading the data

# Download the three global time-series tables (confirmed cases, deaths and
# recovered cases) from the CSSEGISandData/COVID-19 GitHub repository.
COVID_confirmed_global_raw <- read_csv(
  file = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
)

COVID_deaths_global_raw <- read_csv(
  file = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv"
)

COVID_recovered_global_raw <- read_csv(
  file = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv"
)

Reshaping and formatting data

# Reshape each wide table into long format. Every column from `1/22/20`
# through the last column of the table is stacked into two columns: `date`
# (the former column name, still a character string at this point) and
# `n_cases` (the value that was stored under that column).
COVID_confirmed_global_longer <- pivot_longer(
  COVID_confirmed_global_raw,
  cols      = `1/22/20`:last_col(),
  names_to  = "date",
  values_to = "n_cases"
)

COVID_deaths_global_longer <- pivot_longer(
  COVID_deaths_global_raw,
  cols      = `1/22/20`:last_col(),
  names_to  = "date",
  values_to = "n_cases"
)

COVID_recovered_global_longer <- pivot_longer(
  COVID_recovered_global_raw,
  cols      = `1/22/20`:last_col(),
  names_to  = "date",
  values_to = "n_cases"
)


# Give all three long tables the same short, syntactic column names.
# The names are assigned by position, so the tables must have exactly
# six columns in this order.
COVID_confirmed_global_longer <- COVID_confirmed_global_longer %>%
  setNames(c("state", "country", "lat", "long", "date", "n_cases"))

COVID_deaths_global_longer <- COVID_deaths_global_longer %>%
  setNames(c("state", "country", "lat", "long", "date", "n_cases"))

COVID_recovered_global_longer <- COVID_recovered_global_longer %>%
  setNames(c("state", "country", "lat", "long", "date", "n_cases"))

# Aggregate to one row per country and date by summing `n_cases` over all
# rows that share the same country and date (e.g. separate `state` rows).
# `summarize()` keeps only the grouping columns and the summary, so `state`,
# `lat` and `long` are dropped without a separate `select()`.
#
# `.groups = "drop_last"` is spelled out on purpose: the result stays grouped
# by `country` (the same grouping `summarize()` would pick by default), and
# stating it explicitly silences the "has grouped output by" message.
COVID_confirmed_global_longer <- COVID_confirmed_global_longer %>%
  group_by(country, date) %>%
  summarize(n_cases = sum(n_cases), .groups = "drop_last")

COVID_deaths_global_longer <- COVID_deaths_global_longer %>%
  group_by(country, date) %>%
  summarize(n_cases = sum(n_cases), .groups = "drop_last")

COVID_recovered_global_longer <- COVID_recovered_global_longer %>%
  group_by(country, date) %>%
  summarize(n_cases = sum(n_cases), .groups = "drop_last")

# Convert the date columns from character to Date.
# The labels carry a two-digit year (e.g. "1/22/20"), so the year has to be
# parsed with `%y`. With `%Y` the "20" is read as the literal year 0020,
# which puts every observation two thousand years in the past.
COVID_confirmed_global_longer$date <- as.Date(COVID_confirmed_global_longer$date, format = "%m/%d/%y")
COVID_deaths_global_longer$date <- as.Date(COVID_deaths_global_longer$date, format = "%m/%d/%y")
COVID_recovered_global_longer$date <- as.Date(COVID_recovered_global_longer$date, format = "%m/%d/%y")

# Add `new_cases`: the day-over-day change in `n_cases` within each country.
# Rows are sorted by country and date first so that `lag()` sees the previous
# day. The explicit `group_by(country)` keeps the difference from reaching
# across a country boundary; without it the block would depend on whatever
# grouping an earlier step happened to leave behind.
# `default = 0` makes the first day's `new_cases` equal to that day's
# `n_cases`.
COVID_confirmed_global_longer <- COVID_confirmed_global_longer %>%
  arrange(country, date) %>%
  group_by(country) %>%
  mutate(new_cases = n_cases - lag(n_cases, default = 0))

COVID_deaths_global_longer <- COVID_deaths_global_longer %>%
  arrange(country, date) %>%
  group_by(country) %>%
  mutate(new_cases = n_cases - lag(n_cases, default = 0))

COVID_recovered_global_longer <- COVID_recovered_global_longer %>%
  arrange(country, date) %>%
  group_by(country) %>%
  mutate(new_cases = n_cases - lag(n_cases, default = 0))

Let’s look at the current data format

# Render the first rows of the confirmed-cases table as a markdown table.
COVID_confirmed_global_longer %>%
  head() %>%
  knitr::kable(format = "markdown")

country	date	n_cases	new_cases
Afghanistan	20-01-22	0	0
Afghanistan	20-01-23	0	0
Afghanistan	20-01-24	0	0
Afghanistan	20-01-25	0	0
Afghanistan	20-01-26	0	0
Afghanistan	20-01-27	0	0

Creating some functions for quick stats

# Print worldwide totals for confirmed cases, deaths and recovered cases.
#
# Reads the global tables `COVID_confirmed_global_longer`,
# `COVID_deaths_global_longer` and `COVID_recovered_global_longer`, which must
# provide the columns `country`, `n_cases` and `new_cases`.
# Takes no arguments and prints three lines, one per table.
world_summary <- function() {

  # Reduce one long table to a single row of world totals: take each
  # country's largest `n_cases` and its last `new_cases`, then sum both
  # over all countries.
  # NOTE(review): `max(n_cases)` is each country's peak value; it equals the
  # most recent value only if the series never decreases. The messages say
  # "as of today" -- confirm that the peak is what is intended.
  # `dplyr::last()` takes the last row per country, so the tables are
  # expected to be sorted by date within each country.
  world_totals <- function(df) {
    df %>%
      group_by(country) %>%
      summarize(n_cases_today = max(n_cases),
                new_cases_today = dplyr::last(new_cases)) %>%
      ungroup() %>%
      summarize(n_cases_total = sum(n_cases_today),
                new_cases_total = sum(new_cases_today))
  }

  confirmed <- world_totals(COVID_confirmed_global_longer)
  deaths    <- world_totals(COVID_deaths_global_longer)
  recovered <- world_totals(COVID_recovered_global_longer)

  print(paste0("number of total confirmed cases in the world as of today:  ", confirmed$n_cases_total, " with ", confirmed$new_cases_total, " new cases"))
  print(paste0("number of total deaths in the world  as of today:  ", deaths$n_cases_total, " with ", deaths$new_cases_total, " new deaths"))
  print(paste0("number of total recovered cases in the world  as of today:  ", recovered$n_cases_total, " with ", recovered$new_cases_total, " new cases"))

}


# Print confirmed cases, deaths and recovered cases for a single country.
#
# Reads the global tables `COVID_confirmed_global_longer`,
# `COVID_deaths_global_longer` and `COVID_recovered_global_longer`, which must
# provide the columns `country`, `n_cases` and `new_cases`.
#
# country1: a single country name, matched exactly against the `country`
#           column (e.g. "US").
#
# Prints three lines, one per table. Stops with an error if `country1` is not
# a single non-missing value or if a table has no rows for that country;
# previously an unknown name silently printed messages with the numbers
# missing.
country_summary <- function(country1) {
  stopifnot(length(country1) == 1L, !is.na(country1))

  # Reduce one long table to the selected country's largest `n_cases` and
  # its last `new_cases`. `what` only labels the table in the error message.
  # NOTE(review): `max(n_cases)` is the peak value; it equals the most recent
  # value only if the series never decreases -- confirm this is intended.
  country_totals <- function(df, what) {
    country_rows <- df %>%
      group_by(country) %>%
      dplyr::filter(country == country1)

    if (nrow(country_rows) == 0L) {
      stop("no ", what, " data found for country '", country1, "'",
           call. = FALSE)
    }

    country_rows %>%
      summarize(n_cases_today = max(n_cases),
                new_cases_today = dplyr::last(new_cases))
  }

  # Compute everything before printing so that a failed lookup does not
  # leave a partial report behind.
  confirmed <- country_totals(COVID_confirmed_global_longer, "confirmed-case")
  deaths    <- country_totals(COVID_deaths_global_longer, "death")
  recovered <- country_totals(COVID_recovered_global_longer, "recovered-case")

  print(paste0("number of confirmed cases in ", country1, " as of today:  ", confirmed$n_cases_today, " with ", confirmed$new_cases_today, " new cases"))

  print(paste0("number of deaths in ", country1, " as of today:  ", deaths$n_cases_today, " with ", deaths$new_cases_today, " new deaths"))

  print(paste0("number of recovered cases in ", country1, " as of today:  ", recovered$n_cases_today, " with ", recovered$new_cases_today, " new cases"))

}


# Print the worldwide totals for confirmed cases, deaths and recovered cases.
# NOTE(review): the lines starting with `## [1]` below appear to be the output
# captured when this document was last rendered; they are not executed.
world_summary()

## [1] "number of total confirmed cases in the world as of today:  227649349 with 593099 new cases"
## [1] "number of total deaths in the world  as of today:  4679139 with 8881 new deaths"
## [1] "number of total recovered cases in the world  as of today:  137249983 with 0 new cases"

# Print the same three figures for a selection of individual countries.
# Each name must match a value of the `country` column exactly.
country_summary("US")

## [1] "number of confirmed cases in US as of today:  41993789 with 207886 new cases"
## [1] "number of deaths in US as of today:  672635 with 2635 new deaths"
## [1] "number of recovered cases in US as of today:  6298082 with 0 new cases"

country_summary("Italy")

## [1] "number of confirmed cases in Italy as of today:  4627699 with 4544 new cases"
## [1] "number of deaths in Italy as of today:  130233 with 66 new deaths"
## [1] "number of recovered cases in Italy as of today:  4144608 with 0 new cases"

country_summary("Spain")

## [1] "number of confirmed cases in Spain as of today:  4929546 with 3222 new cases"
## [1] "number of deaths in Spain as of today:  85783 with 44 new deaths"
## [1] "number of recovered cases in Spain as of today:  150376 with 0 new cases"

country_summary("China")

## [1] "number of confirmed cases in China as of today:  107838 with 49 new cases"
## [1] "number of deaths in China as of today:  4851 with 0 new deaths"
## [1] "number of recovered cases in China as of today:  99228 with 0 new cases"

country_summary("Egypt")

## [1] "number of confirmed cases in Egypt as of today:  295639 with 588 new cases"
## [1] "number of deaths in Egypt as of today:  16935 with 14 new deaths"
## [1] "number of recovered cases in Egypt as of today:  232179 with 0 new cases"

country_summary("Germany")

## [1] "number of confirmed cases in Germany as of today:  4137062 with 9904 new cases"
## [1] "number of deaths in Germany as of today:  92928 with 22 new deaths"
## [1] "number of recovered cases in Germany as of today:  3659260 with 0 new cases"

country_summary("France")

## [1] "number of confirmed cases in France as of today:  7029959 with 7756 new cases"
## [1] "number of deaths in France as of today:  116618 with 107 new deaths"
## [1] "number of recovered cases in France as of today:  415111 with 0 new cases"

COVID-19 Exploratory Data Analysis

Introduction

Reading the data

Reshaping and formatting data

Creating some functions for quick stats

Hossameldin Mohammed

Senior Machine Learning Engineer