Data on holdover time of lightning-ignited wildfires

Basic information

Author: Jose V. Moris

Contact:

Personal website: https://sites.google.com/site/josevmoris/

Affiliation: DISAFA, University of Turin (Italy)

Date: November 2022

R version: 4.1.0

RStudio Desktop version: 1.4.1106

Platform: Windows 10 64-bit


Data loading

First, load the necessary libraries. Then load the data and inspect its structure. Make sure that the 3 CSV files containing the data are located in the working directory (an optional file check is sketched after the loading code below).

# load libraries
library(tidyverse) # version 1.3.1
library(knitr) # version 1.33
# load the data
CD <- read.csv("1_censored_data.csv") # censored data
NCD <- read.csv("2_non_censored_data.csv") # non-censored data
AD <- read.csv("3_ancillary_data.csv") # ancillary data

# check
class(CD)
## [1] "data.frame"
names(CD)
##  [1] "Study_id"          "Reference"         "Time_interval"    
##  [4] "Time_interval_d"   "Lower_limit_d"     "Upper_limit_d"    
##  [7] "N_fires"           "RF"                "CRF"              
## [10] "Original_data"     "Data_location"     "Collection_method"
str(CD)
## 'data.frame':    2311 obs. of  12 variables:
##  $ Study_id         : chr  "CON2006CH01" "CON2006CH01" "CON2006CH01" "CON2006CH01" ...
##  $ Reference        : chr  "Conedera et al 2006" "Conedera et al 2006" "Conedera et al 2006" "Conedera et al 2006" ...
##  $ Time_interval    : chr  "1 day" "1 day" "1 day" "1 day" ...
##  $ Time_interval_d  : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ Lower_limit_d    : num  0 1 2 3 4 5 6 0 1 2 ...
##  $ Upper_limit_d    : num  1 2 3 4 5 6 7 1 2 3 ...
##  $ N_fires          : int  117 28 4 1 2 1 1 15 6 1 ...
##  $ RF               : num  0.75974 0.18182 0.02597 0.00649 0.01299 ...
##  $ CRF              : num  0.76 0.942 0.968 0.974 0.987 ...
##  $ Original_data    : chr  "N" "N" "N" "N" ...
##  $ Data_location    : chr  "Figure 4" "Figure 4" "Figure 4" "Figure 4" ...
##  $ Collection_method: chr  "Personal communication" "Personal communication" "Personal communication" "Personal communication" ...
dim(CD)
## [1] 2311   12
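
As an optional safeguard (not part of the original script), the presence of the three CSV files in the working directory can be checked with base R before calling read.csv:

# optional: confirm that the three data files are in the working directory
data_files <- c("1_censored_data.csv", "2_non_censored_data.csv", "3_ancillary_data.csv")
stopifnot(all(file.exists(data_files)))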


Censored data

Plots

Create some barplots to visualize the distributions of holdover time. We use barplots because the censored data consist of binned frequency distributions.

# barplots
bp1 <- CD %>% filter(Study_id %in% c("CON2006CH01")) # choose one study

ggplot(bp1, aes(x = factor(round(Upper_limit_d, 2)), y = RF*100)) + # barplot
  geom_bar(stat="identity") +
  labs(x = "Holdover time (days)", y = "Relative frequency of LIW (%)")

bp2 <- CD %>% filter(Study_id %in% c("KOU1967CA01")) # choose one study

ggplot(bp2, aes(x = factor(round(Upper_limit_d, 2)), y = RF*100)) + # barplot
  geom_bar(stat="identity") +
  labs(x = "Holdover time (days)", y = "Relative frequency of LIW (%)")

bp3 <- CD %>% filter(Study_id %in% c("DOR2022AU01")) # choose one study

ggplot(bp3, aes(x = Upper_limit_d, y = RF*100)) + # barplot
  geom_bar(stat="identity") +
  labs(x = "Holdover time (days)", y = "Relative frequency of LIW (%)")

bp <- CD %>% filter(Study_id %in% c("CON2006CH01", # choose several studies
                                    "KOU1967CA01",
                                    "PIN2017ES01",
                                    "DOR2022AU01")) %>% 
  mutate(day = cut(Upper_limit_d, # calculate daily relative frequency
                   seq(0, max(Upper_limit_d, na.rm = T) +1))) %>%
  group_by(Study_id, day) %>% 
  summarize(RF = sum(RF))

bp %>% group_by(Study_id) %>%
  summarize(check = sum(RF)) # the values must be 1
## # A tibble: 4 x 2
##   Study_id    check
##   <chr>       <dbl>
## 1 CON2006CH01  1.00
## 2 DOR2022AU01  1.00
## 3 KOU1967CA01  1   
## 4 PIN2017ES01  1.00
ggplot(bp, aes(x = day, y = RF*100)) + # barplot with several distributions
  geom_bar(stat="identity") +
  labs(x = "Holdover time (days)", y = "Relative frequency of LIW (%)") +
  geom_text(aes(label = round(RF*100, 1), vjust = -0.25)) +
  ylim(0,100) +
  facet_wrap(~Study_id, ncol = 1)
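
If the faceted barplot needs to be saved to disk, ggplot2::ggsave can export the last plot; the file name and dimensions below are only illustrative.

# optional: export the last plot (file name and size are illustrative)
ggsave("holdover_barplots_censored.png", width = 6, height = 9, dpi = 300)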


Statistics

Calculate some summary statistics from the frequency distributions of holdover time. Note that some values may be missing (NA) due to the nature of the censored data, for instance when the last time interval has no upper limit.

# calculate the mean and maximum HOT (holdover time)
cd_mean <- CD %>% group_by(Study_id) %>%
  summarize(mean = sum((Lower_limit_d + Time_interval_d/2)*RF)*24, # mean HOT in hours
            max = max(Upper_limit_d)) %>% # maximum HOT in days
  select(Study_id, mean, max)

# calculate the median HOT
cd_median <- CD %>% mutate(a = CRF - 0.5, # create a secondary variable
                           a = ifelse(a < 0, 100, a)) %>%
  group_by(Study_id) %>%
  filter(a == min(a)) %>% # find the interval in which the median value is
  filter(Lower_limit_d == min(Lower_limit_d)) %>% # in case repeated values exist
  mutate(median = (Lower_limit_d + ( 0.5 - (CRF-RF) )/RF*Time_interval_d)*24) %>% # median HOT in hours
  select(Study_id, median)

# calculate the cumulative relative frequency (CRF) of HOT of day 1
cd_crf_d1 <- CD %>% filter(Upper_limit_d==1) %>%
  rename(crf_d1 = CRF) %>%
  mutate(crf_d1 = crf_d1*100) %>% # CRF of HOT of the first day in percentage
  select(Study_id, crf_d1)

# join all the statistics
cd_sta <- cd_mean %>% left_join(cd_median, by = "Study_id") %>%
  left_join(cd_crf_d1, by = "Study_id") %>%
  rename(mean_h = mean,
         max_d = max,
         median_h = median,
         crf_d1_pct = crf_d1)

kable(cd_sta, caption = "Summary statistics of censored data") # table
Summary statistics of censored data
Study_id mean_h max_d median_h crf_d1_pct
BAR1951US01 NA NA 4.158568 78.99560
BAR1978US01 NA NA 2.990812 90.17000
CON2006CH01 21.038962 7.000000 15.794872 75.97403
CON2006IT01 30.240000 6.000000 20.000000 60.00000
DOR2022AU01 11.031065 5.000000 1.471201 87.39855
DOW2009AU01 320.968280 90.000000 18.541702 64.71898
DUN2010US01 146.400000 23.000000 23.389830 51.30435
GIS1926US01 NA NA 4.750000 85.00000
GIS1931US01 NA NA 4.000000 86.00000
HES2022CA01 51.665454 5.000000 47.680000 23.09091
HES2022US01 44.597015 5.000000 38.550000 25.87065
HES2022US02 41.686411 5.000000 37.043478 28.22300
KOU1967CA01 NA NA 18.378379 59.60000
LAR2005FI01 NA NA 34.666667 42.45283
MAC2019US01 72.336000 11.000000 45.438735 27.40000
MEN2022BR01 22.911318 2.708333 21.595238 61.50943
MOR1948US01 NA NA 6.420601 78.49000
MOR2020CH01 30.777557 9.916667 13.125001 63.11787
MOR2020IT01 26.718750 6.250000 6.000000 71.87500
MUL2021AT01 33.940594 10.000000 17.911330 66.99670
NAS1996CA01 61.627601 15.000000 27.251799 47.78518
PER2021ES01 37.897135 14.000000 5.709677 72.79793
PER2021FR01 37.305558 10.083333 3.000000 75.00000
PER2021GR01 33.829320 3.958333 29.200000 43.43545
PER2021PT01 23.004858 3.875000 15.928572 64.07767
PER2022US01 20.769134 7.000000 12.054688 75.92446
PER2022US02 21.541219 6.958333 13.107143 74.34088
PIN2014ES01 8.597668 3.000000 1.409283 NA
PIN2017ES01 NA NA 1.611111 87.11485
PIN2022ES01 14.271973 9.708333 1.833333 84.40276
SCH2019US01 39.041412 15.000000 17.645756 68.00502
SHO1923US01 NA NA 15.257143 67.00000
SHO1930US01 NA NA 12.827586 68.00000
TAY1969US01 NA NA 10.169491 77.00000
WOT2005CA01 85.945445 28.000000 44.549098 33.46876
WOT2022CA01 88.944727 22.000000 21.186312 56.64035
WOT2022CA02 81.169302 22.000000 19.001683 63.15230
WOT2022CA03 52.845977 22.000000 18.052219 66.47382
WOT2022CA04 59.750589 22.000000 24.905222 49.34630
WOT2022CA05 75.351697 22.000000 39.796767 38.78836
WOT2022CA06 82.683820 22.000000 53.602617 25.53404
XUW2022RU01 47.348838 8.000000 38.692683 30.54264
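
As a quick hand check of the interpolation used for the median, consider CON2006CH01: its first interval (0 to 1 days) already accumulates more than 50% of the fires (RF = CRF = 0.75974), so the median falls within it.

# hand check of the median interpolation for CON2006CH01 (values from the RF/CRF columns above)
(0 + (0.5 - (0.75974 - 0.75974)) / 0.75974 * 1) * 24 # ~15.79 hours, matching median_h in the table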


Non-censored data

Statistics

Calculate some summary statistics using the non-censored data on holdover time. Note that the data may need some manipulation (e.g., removing NA values) before being analyzed.

# check
class(NCD)
## [1] "data.frame"
names(NCD)
## [1] "MOR2020IT01" "MOR2020CH01" "PER2021FR01" "PER2021PT01" "PER2021ES01"
## [6] "MEN2022BR01" "PER2022US01" "PER2022US02" "PIN2022ES01"
str(NCD)
## 'data.frame':    6301 obs. of  9 variables:
##  $ MOR2020IT01: num  0.628 0.837 1.194 1.79 2.159 ...
##  $ MOR2020CH01: num  0.0181 0.0353 0.0769 0.0928 0.0986 ...
##  $ PER2021FR01: num  0.133 0.217 0.233 0.25 0.483 ...
##  $ PER2021PT01: num  0.0167 0.0167 0.0667 0.0667 0.0667 ...
##  $ PER2021ES01: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ MEN2022BR01: num  0.0167 0.1592 0.2658 0.3611 0.4486 ...
##  $ PER2022US01: num  0.000556 0.002778 0.003056 0.004444 0.005 ...
##  $ PER2022US02: num  0 0.000278 0.001667 0.002222 0.003333 ...
##  $ PIN2022ES01: num  0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 ...
dim(NCD)
## [1] 6301    9
# an option to manipulate non-censored data
ncd <- as.list(NCD) # from dataframe to list
ncd <- lapply(ncd, na.omit) # remove NA values

# some statistics
lapply(ncd, length)
## $MOR2020IT01
## [1] 32
## 
## $MOR2020CH01
## [1] 263
## 
## $PER2021FR01
## [1] 36
## 
## $PER2021PT01
## [1] 309
## 
## $PER2021ES01
## [1] 2702
## 
## $MEN2022BR01
## [1] 265
## 
## $PER2022US01
## [1] 6301
## 
## $PER2022US02
## [1] 2693
## 
## $PIN2022ES01
## [1] 1013
sum(unlist(lapply(ncd, length))) # 13,614 fires
## [1] 13614
lapply(ncd, mean)
## $MOR2020IT01
## [1] 26.69755
## 
## $MOR2020CH01
## [1] 30.77072
## 
## $PER2021FR01
## [1] 37.3287
## 
## $PER2021PT01
## [1] 22.98403
## 
## $PER2021ES01
## [1] 37.83713
## 
## $MEN2022BR01
## [1] 22.86785
## 
## $PER2022US01
## [1] 20.75002
## 
## $PER2022US02
## [1] 21.5038
## 
## $PIN2022ES01
## [1] 14.17972
lapply(ncd, median)
## $MOR2020IT01
## [1] 6.275556
## 
## $MOR2020CH01
## [1] 13.00722
## 
## $PER2021FR01
## [1] 3.716667
## 
## $PER2021PT01
## [1] 15.55
## 
## $PER2021ES01
## [1] 5.725
## 
## $MEN2022BR01
## [1] 21.65639
## 
## $PER2022US01
## [1] 12.10611
## 
## $PER2022US02
## [1] 13.08028
## 
## $PIN2022ES01
## [1] 1.7
lapply(ncd, min)
## $MOR2020IT01
## [1] 0.6283333
## 
## $MOR2020CH01
## [1] 0.01805556
## 
## $PER2021FR01
## [1] 0.1333333
## 
## $PER2021PT01
## [1] 0.01666667
## 
## $PER2021ES01
## [1] 0
## 
## $MEN2022BR01
## [1] 0.01666667
## 
## $PER2022US01
## [1] 0.00055556
## 
## $PER2022US02
## [1] 0
## 
## $PIN2022ES01
## [1] 0.01
lapply(ncd, max)
## $MOR2020IT01
## [1] 149.1961
## 
## $MOR2020CH01
## [1] 237.3253
## 
## $PER2021FR01
## [1] 241.85
## 
## $PER2021PT01
## [1] 92.38333
## 
## $PER2021ES01
## [1] 335.6667
## 
## $MEN2022BR01
## [1] 64.43639
## 
## $PER2022US01
## [1] 167.8397
## 
## $PER2022US02
## [1] 166.8367
## 
## $PIN2022ES01
## [1] 232.9333
# another option to manipulate non-censored data
ncd2 <- data.frame(unlist(ncd)) # from list to dataframe
ncd2 <- rownames_to_column(ncd2, "Study_id") %>% # move row names to a column
  rename(HOT = unlist.ncd.) %>% # rename the holdover time variable
  mutate(Study_id = substr(Study_id, 1, 11)) # keep only the 11-character study id
dim(ncd2)
## [1] 13614     2
# additional statistics
ncd_sta <- ncd2 %>% group_by(Study_id) %>% summarize(
  n = n(), # number of fires
  median_h = median(HOT), # median HOT in hours
  mean_h = mean(HOT), # mean HOT in hours
  min_min = min(HOT)*60, # minimum HOT in minutes
  max_d = max(HOT)/24, # maximum HOT in days
  crf_d1_pct = sum(HOT <= 24) / n *100, # CRF of HOT of the first day in percentage
  crf_d3_pct = sum(HOT <= 72) / n *100) # CRF of HOT of the first 3 days in percentage

kable(ncd_sta, caption = "Summary statistics of non-censored data") # table
Summary statistics of non-censored data
Study_id n median_h mean_h min_min max_d crf_d1_pct crf_d3_pct
MEN2022BR01 265 21.656389 22.86785 1.0000002 2.684849 61.50943 100.00000
MOR2020CH01 263 13.007222 30.77072 1.0833336 9.888553 63.11787 87.45247
MOR2020IT01 32 6.275556 26.69755 37.6999998 6.216505 71.87500 84.37500
PER2021ES01 2702 5.725000 37.83713 0.0000000 13.986111 72.79793 84.08586
PER2021FR01 36 3.716667 37.32870 7.9999998 10.077083 75.00000 80.55556
PER2021PT01 309 15.550000 22.98403 1.0000002 3.849306 64.07767 96.76375
PER2022US01 6301 12.106111 20.75002 0.0333336 6.993322 75.92446 93.49310
PER2022US02 2693 13.080278 21.50380 0.0000000 6.951528 74.34088 92.38767
PIN2022ES01 1013 1.700000 14.17972 0.6000000 9.705556 84.40276 96.34748
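
Because the non-censored data are continuous, other statistics can be obtained directly; for instance, the 90th percentile of holdover time per study (an illustrative extension, not part of the original tables):

# illustrative extension: 90th percentile of HOT (in hours) by study
ncd2 %>% group_by(Study_id) %>%
  summarize(p90_h = quantile(HOT, 0.9))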


Plots

Create some histograms to observe the distributions of holdover time. We use histograms because the non-censored data consist of continuous estimates of holdover time (in hours) without any censoring.

# histograms
h1 <- ncd2 %>% filter(Study_id %in% c("PER2022US01")) # choose one study

ggplot(h1, aes(x=HOT, y=..count../sum(..count..)*100)) + # hourly histogram
  geom_histogram(colour="black", boundary = 0, binwidth = 1) +
  scale_x_continuous(breaks=c(0, 24, 48, 72, 96, 120, 144, 168)) +
  labs(x = "Holdover time (hours)", y = "Relative frequency of LIW (%)") +
  ggtitle(h1$Study_id[1])

ggplot(h1, aes(x=HOT, y=..count../sum(..count..)*100)) + # daily histogram
  geom_histogram(colour="black", boundary = 0, binwidth = 24) +
  scale_x_continuous(breaks=c(0, 24, 48, 72, 96, 120, 144, 168)) +
  labs(x = "Holdover time (hours)", y = "Relative frequency  of LIW (%)") +
  ggtitle(h1$Study_id[1]) +
  stat_bin(binwidth = 24, geom="text", colour="black", size=3, boundary = 0,
           aes(label=round(..count../sum(..count..)*100, 1)), vjust=-0.5)
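
An empirical cumulative distribution function offers another view of the same data. The sketch below (reusing the h1 subset created above) plots the ECDF of holdover time for PER2022US01.

# optional: empirical cumulative distribution of HOT for the same study
ggplot(h1, aes(x = HOT)) +
  stat_ecdf() +
  scale_x_continuous(breaks = c(0, 24, 48, 72, 96, 120, 144, 168)) +
  labs(x = "Holdover time (hours)", y = "Cumulative relative frequency of LIW") +
  ggtitle(h1$Study_id[1])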


Ancillary data

Filtering

The records of ancillary data can be filtered to select particular studies based on their characteristics. This can be done easily with the function dplyr::filter.

# check
class(AD)
## [1] "data.frame"
names(AD)
##  [1] "Study_id"           "Reference"          "Publication_type"  
##  [4] "Study_area"         "Country"            "ISO_code"          
##  [7] "Spatial_scale"      "Biome"              "Ecozone"           
## [10] "Climate_class"      "Start_year"         "End_year"          
## [13] "Length_year"        "Min_time_h"         "Max_time_h"        
## [16] "Number_fires"       "Number_records"     "Fire_detection"    
## [19] "Fire_data_source"   "LLS"                "Lightning_level"   
## [22] "DE_pct"             "LA_km"              "Method"            
## [25] "Buffer_distance_km" "Temporal_window_d"  "Max_holdover_d"    
## [28] "Selection_criteria" "Dataset"            "Data_collector"    
## [31] "Date_entry"         "Data_check"         "Comments"
str(AD)
## 'data.frame':    42 obs. of  33 variables:
##  $ Study_id          : chr  "CON2006CH01" "CON2006IT01" "KOU1967CA01" "NAS1996CA01" ...
##  $ Reference         : chr  "Conedera et al 2006" "Conedera et al 2006" "Kourtz 1967" "Nash and Johnson 1996" ...
##  $ Publication_type  : chr  "Proceeding" "Proceeding" "Report" "Paper" ...
##  $ Study_area        : chr  "Ticino" "Aosta Valley" "Canada" "Alberta and Saskatchewan" ...
##  $ Country           : chr  "Switzerland" "Italy" "Canada" "Canada" ...
##  $ ISO_code          : chr  "CH-TI" "IT-23" "CA" "CA-AB; CA-SK" ...
##  $ Spatial_scale     : chr  "Local" "Local" "Continental" "Regional" ...
##  $ Biome             : chr  "Temperate coniferous forests" "Temperate coniferous forests" "Boreal forests" "Boreal forests" ...
##  $ Ecozone           : chr  "Palearctic" "Palearctic" "Nearctic" "Nearctic" ...
##  $ Climate_class     : chr  "Dfb: cold; no dry season; warm summer " "Dfb: cold; no dry season; warm summer " "Dfc: cold; no dry season; cold summer" "Dfc: cold; no dry season; cold summer" ...
##  $ Start_year        : int  1981 2003 1960 1988 1992 2012 2017 2000 2004 2009 ...
##  $ End_year          : int  2004 2003 1963 1993 2001 2015 2017 2009 2009 2014 ...
##  $ Length_year       : int  24 1 4 4 10 4 1 10 6 6 ...
##  $ Min_time_h        : num  24 24 4 24 24 24 24 24 0.1 0.02 ...
##  $ Max_time_h        : int  24 24 24 24 24 24 24 2088 24 24 ...
##  $ Number_fires      : int  154 25 3615 2551 5169 797 95 1797 464 357 ...
##  $ Number_records    : int  7 6 16 15 28 15 11 4 24 19 ...
##  $ Fire_detection    : chr  "Fire database" "Fire database" "Fire database" "Fire database" ...
##  $ Fire_data_source  : chr  "Swissfire" "Aosta Valley wildfire database" "Survey of lightning-caused forest fire reports of Canada" "Alberta wildfire database; Saskatchewan wildfire database" ...
##  $ LLS               : chr  "" "" "" "LLP" ...
##  $ Lightning_level   : chr  "" "" "" "Flash" ...
##  $ DE_pct            : chr  "" "" "" "90" ...
##  $ LA_km             : chr  "" "" "" "5-10" ...
##  $ Method            : chr  "Storm time" "Storm time" "Storm time" "Lightning match" ...
##  $ Buffer_distance_km: chr  "" "" "" "5" ...
##  $ Temporal_window_d : chr  "" "" "" "15" ...
##  $ Max_holdover_d    : chr  "7" "6" "> 11" "15" ...
##  $ Selection_criteria: chr  "" "" "" "Minimum holdover time" ...
##  $ Dataset           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Data_collector    : chr  "Jose V. Moris" "Jose V. Moris" "Jose V. Moris" "Jose V. Moris" ...
##  $ Date_entry        : chr  "3/4/2021" "3/4/2021" "3/4/2021" "10/4/2021" ...
##  $ Data_check        : chr  "Yes" "Yes" "No" "No" ...
##  $ Comments          : chr  "Data provided by G.B. Pezzatti and M. Conedera" "Data provided by G.B. Pezzatti and M. Conedera" "" "" ...
dim(AD)
## [1] 42 33
# filtering
fd_it_es <- AD %>% dplyr::filter(Country %in% c("Italy", "Spain")) # studies from Italy and Spain

fd_y2000 <- AD %>% dplyr::filter(Start_year >= 2000) # studies from 2000 onwards

fd_200fires <- AD %>% dplyr::filter(Number_fires >= 200) # studies with at least 200 fires

fd_max_prox <- AD %>% dplyr::filter(Selection_criteria == "Maximum proximity index") #  studies that used the maximum proximity index

fd_boreal_5y <- AD %>% dplyr::filter(Biome == "Boreal forests" & # studies from boreal regions and at least 5 years of duration
                                     Length_year >= 5)
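
Filters can also be combined with dplyr::count to see how many studies share a given characteristic; the two calls below are only illustrative.

# illustrative: number of studies per biome and per country
AD %>% count(Biome)
AD %>% count(Country, sort = TRUE)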


Data manipulation

Censored, non-censored, and ancillary data can be combined to perform further analyses. Here we include a couple of examples.

cd_country <- CD %>% left_join(AD %>% select(Study_id, Country), # add country information to censored data
                             by = "Study_id")

bp_ch <-  cd_country %>% filter(Country == "Switzerland") %>% # filter studies from Switzerland
  mutate(day = cut(Upper_limit_d, # calculate daily relative frequency
                   seq(0, max(Upper_limit_d, na.rm = T) +1))) %>%
  group_by(Study_id, day) %>% 
  summarize(RF = sum(RF))

bp_ch %>% group_by(Study_id) %>%
  summarize(check = sum(RF)) # the values must be 1
## # A tibble: 2 x 2
##   Study_id    check
##   <chr>       <dbl>
## 1 CON2006CH01  1.00
## 2 MOR2020CH01  1.00
ggplot(bp_ch, aes(x = day, y = RF*100)) + # barplot of Swiss distributions
  geom_bar(stat="identity") +
  labs(x = "Holdover time (days)", y = "Relative frequency of LIW (%)") +
  geom_text(aes(label = round(RF*100, 1), vjust = -0.25)) +
  ylim(0,80) +
  facet_wrap(~Study_id, ncol = 2)

ad <- AD %>% left_join(cd_sta, by = "Study_id") # add statistics calculated from censored data to ancillary data 

ad$Biome <- factor(ad$Biome, # reorder biomes 
                  levels = c("Boreal forests",
                             "Temperate coniferous forests",
                             "Temperate broadleaf and mixed forests",
                             "Mediterranean forests",
                             "Flooded grasslands and savannas"))

ggplot(ad, aes(x=Biome, y=median_h)) + # boxplot of median HOT by biome
  geom_boxplot() +
  stat_boxplot(geom ="errorbar", width = 0.25) +
  geom_point(color="black", shape=16) +
  labs(x = "Biome",
       y = "Median holdover time (hours)") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 20))
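
The statistics derived from the censored and non-censored data can also be joined to compare the two sets of estimates for the nine studies present in both tables (a minimal sketch reusing the cd_sta and ncd_sta objects created above):

# compare statistics estimated from censored (_cd) and non-censored (_ncd) data
cd_sta %>% inner_join(ncd_sta, by = "Study_id", suffix = c("_cd", "_ncd")) %>%
  select(Study_id, mean_h_cd, mean_h_ncd, median_h_cd, median_h_ncd)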

# end of R script