Thetazero Pubs
marileksadata wind winds grep storm libraries library fatalities flood high

SYNOPSIS:

  1. Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?
  2. Across the United States, which types of events have the greatest economic consequences?

Data Dictionary ( raw data )

# Variable Variable Description Value Description
1 EVTYPE Event Type Factor w/ 985 levels - non-normalized strings
2 FATALITIES Fatalities number
3 INJURIES Injuries number
4 PROPDMG Property Damage base number for calculation using PROPDMGEXP
5 PROPDMGEXP symbol for exponential calculation Factor w/ 18 levels - ? + 0 1 2 3 4 5 6 7 8 B h H K m M - non-normalized strings
6 CROPDMG Crop Damage base number for calculation using PROPDMGEXP
7 CROPDMGEXP symbol for exponential calculation Factor w/ 8 levels ? 0 2 B k K m M - non-normalized strings
8 BGN_DATE Starting Date D/M/YYYY H:MM:SS
9 STATE State Abbreviation Factor w/ 72 levels - non-normalized strings
10 REMARKS Comments user stories

Data Dictionary ( added variables )

# Variable Variable Description Value Description
1 NEW_EVTYPE Event Type normalized strings
2 YEAR Year of the event number: YYYY
3 HARM Total Harm to Health number calculated with FATALITIES and INJURIES
4 DAMAGE Total Economic Damage number calculated with PROPDMG and CROPDMG
5 PDMG Property Damage number calculated with PROPDMG and PROPDMGEXP
6 CDMG Crop Damage number calculated with CROPDMG and CROPDMGEXP

DATA PROCESSING

#cleanup the environment, set working directory to where the data file resides
rm(list=ls(all=TRUE)) 
setwd("~/RepData_PeerAssessment2/")

#load libraries
suppressWarnings(suppressMessages(library(dplyr)))
library(knitr)
library(ggplot2)
library(lubridate)
library(scales)
library(grid)

download.file("https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2", 
              destfile="repdata-data-StormData.csv.bz2", method="curl")

# documentation shows 48 official event types
# note: I have changed 'Hurricane (Typhoon)' to 'Hurricane' and 'Debris Flow' to 'Debris'
# to obtain better mapping
etype <- tolower(c("Astronomical Low Tide", "Avalanche", "Blizzard", "Coastal Flood", "Cold/Wind Chill", 
                   "Debris","Dense Fog","Dense Smoke","Drought","Dust Devil","Dust Storm","Excessive Heat",
                   "Extreme Cold/Wind Chill","Flash Flood","Flood","Frost/Freeze","Funnel Cloud","Freezing Fog",
                   "Hail","Heat","Heavy Rain","Heavy Snow","High Surf","High Wind","Hurricane",
                   "Ice Storm","Lake-Effect Snow","Lakeshore Flood","Lightning","Marine Hail",
                   "Marine High Wind","Marine Strong Wind","Marine Thunderstorm Wind", 
                   "Rip Current","Seiche","Sleet", "Storm Surge/Tide","Strong Wind", "Thunderstorm Wind",
                   "Tornado","Tropical Depression","Tropical Storm","Tsunami","Volcanic Ash", 
                   "Waterspout","Wildfire", "Winter Storm" ,"Winter Weather"))


# read only relevant columns
p <- select(read.csv("repdata-data-StormData.csv.bz2",header=TRUE, 
                     na.strings=c("NA","N/A","")), EVTYPE, FATALITIES, INJURIES, 
            PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP, BGN_DATE, STATE, REMARKS)

# add variable 'YEAR' and set it to YYYY
p$YEAR <- mdy_hms(p$BGN_DATE)  
p$YEAR <- year(p$YEAR)

#the data is not complete for years before 1996 therefore I subset for years between 1996 and 2011
y <- subset(p, YEAR >= 1996) 

#correct the 'flood' event in Napa River on 1/1/2006 - 115 B -> 115 M  
y$PROPDMGEXP[y$PROPDMG == 115 & y$PROPDMGEXP == "B"] <- c("M")

# correct exponent and set new variables according to the officially confirmed values of exponent
# see page 12 of https://d396qusza40orc.cloudfront.net/repdata%2Fpeer2_doc%2Fpd01016005curr.pdf
# note: I have decided to only use K|k, M|m, B|b for calculations  and use exp 1 for the rest of the data
y$PDMG <- c(1)
y$CDMG <- c(1)
y$PDMG[grep("K|k", y$PROPDMGEXP)] <- c(1000)
y$CDMG[grep("K|k", y$CROPDMGEXP)] <- c(1000)

y$PDMG[grep("M|m", y$PROPDMGEXP)] <- c(1000000)
y$CDMG[grep("M|m", y$CROPDMGEXP)] <- c(1000000)

y$PDMG[grep("B|b", y$PROPDMGEXP)] <- c(1000000000)
y$CDMG[grep("B|b", y$CROPDMGEXP)] <- c(1000000000)

# set PDMG and CDMG to in cases where PROPDMGEXP and CROPDMGEXP are missing
y$PDMG[is.na(y$PROPDMGEXP) & y$PDMG == 1] <- c(0) 
y$CDMG[is.na(y$CROPDMGEXP) & y$CDMG == 1] <- c(0) 

# calculate PDMG and CDMG in cases number is present and exponent is provided
y$PDMG[y$PROPDMG > 0] <- y$PDMG[y$PROPDMG > 0] * y$PROPDMG[y$PROPDMG > 0]
y$CDMG[y$CROPDMG > 0] <- y$CDMG[y$CROPDMG > 0] * y$CROPDMG[y$CROPDMG > 0]

# normalize event types by setting them to lower case
y$EVTYPE <- tolower(y$EVTYPE)

# add new variable and set it to 'other' first, then loop through official event types,
# set all records with matching cases 
y$NEW_EVTYPE <- c("other")
for (index in 1:48){
        u <- grep(etype[index], y$EVTYPE)
        z <- y$EVTYPE[u]
        if (length(z) >= 1) {
                y$NEW_EVTYPE[u] <- etype[index]
        }
}

# correct event types that did not match the official 48 
y$NEW_EVTYPE[grep("tstm wind|thunderstorm|tstm|tstm wnd", y$EVTYPE)] <- c("thunderstorm wind")
y$NEW_EVTYPE[grep("wild|fire", y$EVTYPE)] <- c("wildfire")
y$NEW_EVTYPE[grep("light snow|snow/blowing snow|winter mix|cold weather|wintery mix", y$EVTYPE)] <- c("winter weather")
y$NEW_EVTYPE[grep("extreme cold|unseasonably cold|unusually cold|cool spell|record cool|record cold", y$EVTYPE)] <- c("extreme cold/wind chill")
y$NEW_EVTYPE[grep("wind chil|windchill|prolong cold|cold temperatures|cold temperature", y$EVTYPE)] <- c("cold/wind chill")
y$NEW_EVTYPE[grep("wind gusts|wnd| wind|gusty lake wind|whirlwind|winds|wind", y$EVTYPE)] <- c("high wind")
y$NEW_EVTYPE[grep("fog|vog", y$EVTYPE)] <- c("dense fog")
y$NEW_EVTYPE[grep("storm surge|wind and wave|high seas|rough seas", y$EVTYPE)] <- c("storm surge/tide")
y$NEW_EVTYPE[grep("dry|record warmth|driest month|record low rainfall|hot weather", y$EVTYPE)] <- c("heat")
y$NEW_EVTYPE[grep("unseasonably hot|hot spell|record warm|very warm|prolong warmth|unusually warm", y$EVTYPE)] <- c("heat")
y$NEW_EVTYPE[grep("smoke", y$EVTYPE)] <- c("dense smoke")
y$NEW_EVTYPE[grep("freezing rain|freezing drizzle|frost|freeze", y$EVTYPE)] <- c("frost/freeze")
y$NEW_EVTYPE[grep("dust", y$EVTYPE)] <- c("dust storm")
y$NEW_EVTYPE[grep("typhoon", y$EVTYPE)] <- c("hurricane")
y$NEW_EVTYPE[grep("astronomical high tide|heavy seas|rogue wave|high water", y$EVTYPE)] <- c("storm surge/tide")
y$NEW_EVTYPE[grep("landslide|record rainfall|excessive rainfall|rain (heavy)|mudslide", y$EVTYPE)] <- c("heavy rain")
y$NEW_EVTYPE[grep("ice", y$EVTYPE)] <- c("ice storm")
y$NEW_EVTYPE[grep("record snow|accumulated snowfall|excessive snow|snow showers", y$EVTYPE)] <- c("heavy snow")

# subset and check if the number of event types set to "other" could be acceptable to continue
s <- subset(y, INJURIES > 0 | FATALITIES > 0 | PROPDMG > 0 | CROPDMG > 0)
length(s$NEW_EVTYPE[s$NEW_EVTYPE == "other"])
## [1] 929
length(s$NEW_EVTYPE[s$NEW_EVTYPE != "other"])
## [1] 200389
# add new var and sum up fitalities and enjuries since both contribute to overall population health
# that is being analized 
s$HARM <- s$FATALITIES + s$INJURIES

# add new variable and sum up property and crop damage
s$DAMAGE <- s$PDMG + s$CDMG

# subset using only variables that are needed for further analysis and plotting,
# check if any NA exist ( TRUE would mean: no NA)
s <- select(s, NEW_EVTYPE, HARM, DAMAGE, FATALITIES, INJURIES, PDMG, CDMG)
all(colSums(is.na(s)) == 0)
## [1] TRUE
# data to plot
str(s)
## 'data.frame':    201318 obs. of  7 variables:
##  $ NEW_EVTYPE: chr  "winter storm" "tornado" "high wind" "high wind" ...
##  $ HARM      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ DAMAGE    : num  418000 100000 3000 5000 2000 400000 12000 8000 12000 75000 ...
##  $ FATALITIES: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ INJURIES  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PDMG      : num  380000 100000 3000 5000 2000 400000 12000 8000 12000 75000 ...
##  $ CDMG      : num  38000 0 0 0 0 0 0 0 0 0 ...

HARM TO POPULATION HEALTH FROM THE TOP 5 EVENT TYPES

# analysis was done for top 20 and 10 which confirmed that 5 would be enough to draw necessary 
# conclusions to answer the questions of this assignment

#group by event type and calculate totals separately fatalities vs injuries
a1 <- aggregate(FATALITIES ~ NEW_EVTYPE, data=s, FUN=sum)
a1 <- head(arrange(a1, desc(FATALITIES),NEW_EVTYPE), 5)
a1$HARM_TYPE <- c('fatalities')
a1$FATALITIES <- a1$FATALITIES/1000
colnames(a1) <- c('Event Type', 'Harm', 'Harm Type')

a2 <- aggregate(INJURIES ~ NEW_EVTYPE, data=s, FUN=sum)
a2 <- head(arrange(a2, desc(INJURIES),NEW_EVTYPE), 5)
a2$HARM_TYPE <- c('injuries')
a2$INJURIES <- a2$INJURIES/1000
colnames(a2) <- c('Event Type', 'Harm', 'Harm Type')

a <- rbind(a1, a2)
colnames(a) <- c('Event Type', 'Harm', 'Harm Type')
arrange(a, desc(Harm))
##    Event Type   Harm  Harm Type
## 1     tornado 20.667   injuries
## 2       flood  8.441   injuries
## 3        heat  7.708   injuries
## 4   high wind  6.712   injuries
## 5   lightning  4.141   injuries
## 6        heat  2.039 fatalities
## 7     tornado  1.511 fatalities
## 8       flood  1.309 fatalities
## 9   high wind  1.021 fatalities
## 10  lightning  0.651 fatalities
# group by event type and calculate totals for both fatalities and injuries
aggr <- aggregate(HARM ~ NEW_EVTYPE, data=s, FUN=sum)
k <- head(arrange(aggr, desc(HARM), NEW_EVTYPE), 5)
k$HARM <- k$HARM/1000
colnames(k) <- c("Event Type", "Fatalities & Injuries (in thousands)")
k
##   Event Type Fatalities & Injuries (in thousands)
## 1    tornado                               22.178
## 2      flood                                9.750
## 3       heat                                9.747
## 4  high wind                                7.733
## 5  lightning                                4.792
aggr <- aggregate(HARM ~ NEW_EVTYPE, data=s, FUN=sum)
k <- head(arrange(aggr, desc(HARM),NEW_EVTYPE), 5)
c <- ggplot(k, aes(x=NEW_EVTYPE, y=HARM))
c + geom_bar(stat = "identity", position="dodge") +
        theme(
              plot.margin = unit(c(4,4,6,4),"mm"),
              axis.text = element_text(colour = "white", size=10, face="bold"),
              axis.title.x = element_text(colour = "pink", size=rel(3)),
              axis.title.y = element_text(colour = "black"),
              panel.background = element_rect(fill="gray"),
              panel.grid.minor.y = element_line(size=3),
              panel.grid.major = element_line(colour = "orange"),
              plot.background = element_rect(fill="orange") ) +
        annotate("text", "tornado", 21000,label = "22,178", 
                 fontface="bold", colour="white", size=7) +
        annotate("text", "flood", 8000,label = "9,750", 
                 fontface="bold", colour="white", size=7) +
        annotate("text", "heat", 8000,label = "9,747", 
                 fontface="bold", colour="white", size=7) +
        annotate("text", "high wind", 6000,label = "7,733", 
                 fontface="bold", colour="white", size=7) +
        annotate("text", "lightning", 3000,label = "4,792", 
                 fontface="bold", colour="white", size=7) +
        scale_y_continuous(breaks=seq(0,20000,5000)) +
        labs(x = "") + 
        labs(y = "") + 
        labs(title = "Fatalities and Injuries for top 5 events", face = "bold")   

ECONOMIC CONSEQUENCES FROM TOP 5 EVENT TYPES

# analysis was done for top 20 and 10 which confirmed that 5 would be enough to draw necessary 
# conclusions to answer the questions of this assignment

# group by event type and calculate totals for property vs crop
a1 <- aggregate(PDMG ~ NEW_EVTYPE, data=s, FUN=sum)
a1 <- head(arrange(a1, desc(PDMG), NEW_EVTYPE), 5)
a1$DAMAGE_TYPE <- c('property')
a1$PDMG <- a1$PDMG / 1000000000 
colnames(a1) <- c('Event Type', 'Damage', 'Damage Type')

a2 <- aggregate(CDMG ~ NEW_EVTYPE, data=s, FUN=sum)
a2 <- head(arrange(a2, desc(CDMG),NEW_EVTYPE), 5)
a2$DAMAGE_TYPE <- c('crop')
a2$CDMG <- a2$CDMG/1000000000 
colnames(a2) <- c('Event Type', 'Damage', 'Damage Type')

a <- rbind(a1, a2)
arrange(a, desc(Damage))
##          Event Type    Damage Damage Type
## 1         hurricane 81.718890    property
## 2  storm surge/tide 47.845164    property
## 3             flood 44.815448    property
## 4           tornado 24.617149    property
## 5              hail 14.595822    property
## 6           drought 13.367574        crop
## 7             flood  6.369928        crop
## 8         hurricane  5.350125        crop
## 9              hail  2.503061        crop
## 10        high wind  1.813134        crop
colnames(a) <- c('Event Type', 'Damage (in billion USD)', 'Damage Type')

# group by event type and calculate totals for both property and crop
aggr <- aggregate(DAMAGE ~ NEW_EVTYPE, data=s, FUN=sum)
k <- head(arrange(aggr, desc(DAMAGE),NEW_EVTYPE), 5)
k$DAMAGE <- k$DAMAGE/1000000000 
colnames(k) <- c("Event Type", "Property & Crop Damage (in billion USD)")
k
##         Event Type Property & Crop Damage (in billion USD)
## 1        hurricane                                87.06901
## 2            flood                                51.18538
## 3 storm surge/tide                                47.84606
## 4          tornado                                24.91726
## 5             hail                                17.09888
aggr <- aggregate(DAMAGE ~ NEW_EVTYPE, data=s, FUN=sum)
k <- head(arrange(aggr, desc(DAMAGE),NEW_EVTYPE), 5)
c <- ggplot(k, aes(x=NEW_EVTYPE, y=DAMAGE))
c + geom_bar(stat = "identity", position="dodge") +
        theme(
                plot.margin = unit(c(4,4,6,4),"mm"),
                axis.text = element_text(colour = "white", size=10, face="bold"),
                axis.title.x = element_text(colour = "pink", size=rel(3)),
                axis.title.y = element_text(colour = "black"),
                panel.background = element_rect(fill="gray"),
                panel.grid.minor.y = element_line(size=3),
                panel.grid.major = element_line(colour = "orange"),
                plot.background = element_rect(fill="orange") ) +
        
        annotate("text", "flood", 45000000000,label = "51 B", 
                 fontface="bold", colour="white", size=7) +
        annotate("text", "hurricane", 80000000000,label = "87 B", 
                 fontface="bold", colour="white", size=7) +
        annotate("text", "hail", 12000000000,label = "17 B", 
                 fontface="bold", colour="white", size=7) +
        annotate("text", "storm surge/tide", 43000000000,label = "48 B", 
                 fontface="bold", colour="white", size=7) +
        annotate("text", "tornado", 20000000000,label = "25 B", 
                 fontface="bold", colour="white", size=7) +
       
        scale_y_continuous(breaks=seq(0,75000000000,25000000000), label = c("0","25","50", "75")) +
        labs(x = "") + 
        labs(y = "") + 
        labs(title = "Property and Crop Damage (in billion USD) for top 5 events", face = "bold")   

RESULTS

Copyright © 2016 thetazero.com All Rights Reserved. Privacy Policy