Introduction

Main content includes:

  1. Part 1: Data Collection
  2. Part 2: Data processing
  3. Part 3: Data analysis

Part 1: Data Collection

1.1 Preparation

rm(list = ls(all.names = TRUE))

1.2 Collection data

PM 2.5 data is collected from multiple sources as Table 1

Table 1. Information about data sources for Air Pollutant - Particulate Matter (PM2.5)

Station_name Station type Location Source
US embassy & consulate Urban Jarkata, Indonesia https://www.airnow.gov/international/us-embassies-and-consulates/#Indonesia
US embassy & consulate Urban Vietiane, Laos https://www.airnow.gov/international/us-embassies-and-consulates/#Laos
US embassy & consulate Urban Rangoon, Myanmar https://www.airnow.gov/international/us-embassies-and-consulates/#Burma
US embassy & consulate Urban Ho Chi Minh, Vietnam https://www.airnow.gov/international/us-embassies-and-consulates/#Vietnam
US embassy & consulate Urban Ha Noi, Vietnam https://www.airnow.gov/international/us-embassies-and-consulates/#Vietnam$Hanoi
5 stations of Singapore pollution control department Urban Singapore, Singapore https://data.gov.sg/dataset/historical-1-hr-pm2-5

Note: PM2.5 concentration from the Historical regional 1-hr PM2.5 value data set accessed on 1 February 2022 from Singapore National Environment Agency which is made available under the terms of the Singapore Open Data Licence version 1.0: https://data.gov.sg/dataset/historical-1-hr-pm2-5

Because data from different sources have different structures, I use different step to collect data from US embassy consulate and data collected from other sites (Singapore Pollution Control Department)

The source of meteorological parameters (wind speed/direction, temperature, pressure, and relative humidity) were from the Global Surface Summary of the Day (GSOD) stations extrated from NOAA Integrated Surface Database (12- melevation) as Table 2

Table 2. Information about data sources for Meteorology data from GSOD stations:

**Station/_name** Location **GSOD/_ID**
Soekarno Hatta Intl AP Jarkata, Indonesia 967490-99999
Wattay Vietiane, Laos 489400-99999
Mingaladon Rangoon, Myanmar 480960-99999
Tan Son Nhat Intl AP Ho Chi Minh, Vietnam 488200-99999
Hanoi Noibai Intl AP Ha Noi, Vietnam 489000-99999
Changi Airport Singapore, Singapore 486980-99999

Collecting data from US embassy & consulate and meteorology data from NOAA

#create a list including name of stations
list <- c("Hanoi"," HoChiMinhCity", "JakartaCentral","Rangoon","Vientiane")

#Create a list including time zone for each geography location
timezone <- c("Asia/Ho_Chi_Minh","Asia/Ho_Chi_Minh","Asia/Jakarta","Asia/Ho_Chi_Minh","Asia/Vientiane") 

#create a list including GSOD station code id for generating meteorology data from NOAA stations
stationid <- c("488200-99999","489000-99999","967490-99999","480960-99999","489400-99999")

Create a loop to merge data including Meteorology variables (NOAA stations) and PM2.5 data (US embassy and consulate stations)

airnow_sea <- data.frame()
for (i in 1:5){
  for (y in 2016:2021) {
    met <- importNOAA(code = stationid[i], year = y)
    if (is.null(met)){
    }else{
    airnow <- read.csv(paste0("http://dosairnowdata.org/dos/historical/",list[i],"/",y,"/",list[i],"_PM2.5_",y,"_YTD.csv"),
                              header = T)
    airnow$Date..LT. <- as.POSIXct(airnow$Date..LT.,tz= timezone[i],format ="%Y-%m-%d %H:%M %p")
    attr(airnow$Date..LT., "tzone") <- "Asia/Ho_Chi_Minh"
    airnow <- as.data.frame(airnow)
    attr(met$date, "tzone") <- "Asia/Ho_Chi_Minh"
    airnow <- merge(airnow, met %>% select("date","ws","wd","air_temp","atmos_pres","visibility","RH"), by.x = "Date..LT.", by.y = "date", all = TRUE)
    attr(airnow$Date..LT., "tzone") <- "Asia/Ho_Chi_Minh"
    airnow_sea <- bind_rows(airnow_sea, airnow)
    }
  }
}

names(airnow_sea)[1:2] <- c("date","station")
names(airnow_sea)[8] <- "PM2.5"
save.image(file ="~/DIME RA/Data construction/airnow.RData")

Overlook data collected

##       date                       station           Parameter        
##  Min.   :2016-01-01 01:00:00   Length:359589      Length:359589     
##  1st Qu.:2017-11-12 01:00:00   Class :character   Class :character  
##  Median :2019-06-28 02:00:00   Mode  :character   Mode  :character  
##  Mean   :2019-03-29 14:23:38                                        
##  3rd Qu.:2020-08-12 21:00:00                                        
##  Max.   :2022-01-01 12:00:00                                        
##                                                                     
##       Year            Month             Day              Hour       
##  Min.   :2016     Min.   : 1.00    Min.   : 1.00    Min.   : 0.00   
##  1st Qu.:2018     1st Qu.: 4.00    1st Qu.: 8.00    1st Qu.: 6.00   
##  Median :2019     Median : 7.00    Median :16.00    Median :12.00   
##  Mean   :2019     Mean   : 6.87    Mean   :15.82    Mean   :11.51   
##  3rd Qu.:2020     3rd Qu.:10.00    3rd Qu.:23.00    3rd Qu.:18.00   
##  Max.   :2022     Max.   :12.00    Max.   :31.00    Max.   :23.00   
##  NA's   :143038   NA's   :143038   NA's   :143038   NA's   :143038  
##      PM2.5              AQI          AQI.Category         Raw.Conc.      
##  Min.   :-999.00   Min.   :-999.00   Length:359589      Min.   :-999.00  
##  1st Qu.:  13.30   1st Qu.:  54.00   Class :character   1st Qu.:  13.00  
##  Median :  24.80   Median :  78.00   Mode  :character   Median :  25.00  
##  Mean   :   8.55   Mean   :  52.88                      Mean   :  27.27  
##  3rd Qu.:  41.60   3rd Qu.: 116.00                      3rd Qu.:  42.00  
##  Max.   : 865.30   Max.   : 741.00                      Max.   : 985.00  
##  NA's   :143038    NA's   :143038                       NA's   :143038   
##   Conc..Unit          Duration           QC.Name                ws       
##  Length:359589      Length:359589      Length:359589      Min.   : 0.00  
##  Class :character   Class :character   Class :character   1st Qu.: 1.50  
##  Mode  :character   Mode  :character   Mode  :character   Median : 2.35  
##                                                           Mean   : 2.58  
##                                                           3rd Qu.: 3.35  
##                                                           Max.   :49.40  
##                                                           NA's   :74253  
##        wd           air_temp       atmos_pres       visibility    
##  Min.   :  0     Min.   : 5.5    Min.   : 992.7   Min.   :   50   
##  1st Qu.: 85     1st Qu.:25.0    1st Qu.:1007.6   1st Qu.: 5000   
##  Median :160     Median :27.0    Median :1009.8   Median : 6500   
##  Mean   :172     Mean   :27.0    Mean   :1009.4   Mean   : 6372   
##  3rd Qu.:255     3rd Qu.:30.0    3rd Qu.:1011.5   3rd Qu.: 8000   
##  Max.   :360     Max.   :41.0    Max.   :1048.1   Max.   :50000   
##  NA's   :99920   NA's   :74274   NA's   :310386   NA's   :159775  
##        RH        
##  Min.   : 15.39  
##  1st Qu.: 66.32  
##  Median : 79.36  
##  Mean   : 77.58  
##  3rd Qu.: 90.85  
##  Max.   :100.00  
##  NA's   :74306

Collecting Singapore data

Collecting PM2.5 data from the Sing EPA website [https://data.gov.sg/dataset/historical-1-hr-pm2-5]:

Sing_pm2.5 <- read_xlsx('~/DIME RA/Raw Data/Sing-historical-1-hr-pm2-5-refined.xlsx')
head(Sing_pm2.5)
## # A tibble: 6 x 6
##   `1hr_pm2.5`         north south  east  west central
##   <dttm>              <dbl> <dbl> <dbl> <dbl>   <dbl>
## 1 2014-04-01 01:00:00     7     6     7    23       8
## 2 2014-04-01 02:00:00    10    10    10    36      10
## 3 2014-04-01 03:00:00    15    20    15    31      15
## 4 2014-04-01 04:00:00    21    14    27    36      12
## 5 2014-04-01 05:00:00    28    12    28    35      22
## 6 2014-04-01 06:00:00    28    31    21    33      24
names(Sing_pm2.5)[1] <- "date" #label the heading for consistent with other data
Sing_pm2.5$date <- as.POSIXct(Sing_pm2.5$date, tz="Asia/Singapore", format="%Y-%m-%d %H:%M:%S")
Sing_pm2.5$pm2_5 <- apply(Sing_pm2.5[,c(2,3,4,5,6)],1,mean, na.rm=TRUE) # define the average PM2.5 value of 5 stations by time
head(Sing_pm2.5)
## # A tibble: 6 x 7
##   date                north south  east  west central pm2_5
##   <dttm>              <dbl> <dbl> <dbl> <dbl>   <dbl> <dbl>
## 1 2014-04-01 01:00:00     7     6     7    23       8  10.2
## 2 2014-04-01 02:00:00    10    10    10    36      10  15.2
## 3 2014-04-01 03:00:00    15    20    15    31      15  19.2
## 4 2014-04-01 04:00:00    21    14    27    36      12  22  
## 5 2014-04-01 05:00:00    28    12    28    35      22  25  
## 6 2014-04-01 06:00:00    28    31    21    33      24  27.4
save.image(file="~/DIME RA/Data construction/Sing_PM25.RData")

Collecting meteorology data from NOAA and merge data with PM2.5 Data

Sing_met <- importNOAA(code="486980-99999",year=2016:2020)
attr(Sing_met$date,"tzone") <- "Asia/Singapore"

load("R-analysis/Sing_PM25.RData")
Sing <- merge(Sing_pm2.5 %>% select("date","pm2_5"),Sing_met %>% select("date","ws","wd","air_temp","atmos_pres","visibility","RH"),by="date") %>% mutate(station="Singapore")
attr(Sing$date, 'tzone') <- "Asia/Ho_Chi_Minh"
names(Sing)[2] <- "PM2.5"

save.image("~/DIME RA/Data construction/Sing.RData")
##                  date PM2.5       ws       wd air_temp atmos_pres visibility
## 1 2016-01-01 08:00:00   4.4 2.850000 23.68109 28.50000         NA         NA
## 2 2016-01-01 09:00:00   7.2 3.350000 25.37408 28.50000         NA         NA
## 3 2016-01-01 10:00:00   7.2 5.300000 36.41841 29.50000     1014.2      14000
## 4 2016-01-01 11:00:00   7.6 4.100000 42.46258 28.00000         NA         NA
## 5 2016-01-01 12:00:00   7.4 5.900000 45.67966 29.00000         NA         NA
## 6 2016-01-01 13:00:00   5.0 6.033333 40.00000 29.03333     1012.9      14000
##         RH   station
## 1 81.41760 Singapore
## 2 81.41760 Singapore
## 3 75.12584 Singapore
## 4 83.78904 Singapore
## 5 76.74897 Singapore
## 6 71.55129 Singapore

Compile data from 2 data sets (Sing EPA, Airnow) into one

SEA <- bind_rows(airnow_sea,Sing)
save.image("~/DIME RA/Data construction/SEA.RData")
##       date                       station           Parameter        
##  Min.   :2016-01-01 01:00:00   Length:403436      Length:403436     
##  1st Qu.:2017-10-08 11:45:00   Class :character   Class :character  
##  Median :2019-05-28 10:00:00   Mode  :character   Mode  :character  
##  Mean   :2019-02-28 06:38:18                                        
##  3rd Qu.:2020-07-04 21:15:00                                        
##  Max.   :2022-01-01 12:00:00                                        
##                                                                     
##       Year            Month             Day              Hour       
##  Min.   :2016     Min.   : 1.00    Min.   : 1.00    Min.   : 0.00   
##  1st Qu.:2018     1st Qu.: 4.00    1st Qu.: 8.00    1st Qu.: 6.00   
##  Median :2019     Median : 7.00    Median :16.00    Median :12.00   
##  Mean   :2019     Mean   : 6.87    Mean   :15.82    Mean   :11.51   
##  3rd Qu.:2020     3rd Qu.:10.00    3rd Qu.:23.00    3rd Qu.:18.00   
##  Max.   :2022     Max.   :12.00    Max.   :31.00    Max.   :23.00   
##  NA's   :186885   NA's   :186885   NA's   :186885   NA's   :186885  
##      PM2.5              AQI          AQI.Category         Raw.Conc.      
##  Min.   :-999.00   Min.   :-999.00   Length:403436      Min.   :-999.00  
##  1st Qu.:  11.60   1st Qu.:  54.00   Class :character   1st Qu.:  13.00  
##  Median :  21.30   Median :  78.00   Mode  :character   Median :  25.00  
##  Mean   :   9.51   Mean   :  52.88                      Mean   :  27.27  
##  3rd Qu.:  37.50   3rd Qu.: 116.00                      3rd Qu.:  42.00  
##  Max.   : 865.30   Max.   : 741.00                      Max.   : 985.00  
##  NA's   :143038    NA's   :186885                       NA's   :186885   
##   Conc..Unit          Duration           QC.Name                ws       
##  Length:403436      Length:403436      Length:403436      Min.   : 0.00  
##  Class :character   Class :character   Class :character   1st Qu.: 1.50  
##  Mode  :character   Mode  :character   Mode  :character   Median : 2.35  
##                                                           Mean   : 2.57  
##                                                           3rd Qu.: 3.35  
##                                                           Max.   :49.40  
##                                                           NA's   :74366  
##        wd            air_temp       atmos_pres       visibility    
##  Min.   :  0.0    Min.   : 5.50   Min.   : 992.7   Min.   :   50   
##  1st Qu.: 80.0    1st Qu.:25.20   1st Qu.:1008.1   1st Qu.: 5000   
##  Median :160.0    Median :27.00   Median :1009.9   Median : 7000   
##  Mean   :170.6    Mean   :27.15   Mean   :1009.6   Mean   : 6810   
##  3rd Qu.:255.0    3rd Qu.:30.00   3rd Qu.:1011.5   3rd Qu.: 8000   
##  Max.   :360.0    Max.   :41.00   Max.   :1048.1   Max.   :50000   
##  NA's   :104524   NA's   :74387   NA's   :339663   NA's   :183293  
##        RH        
##  Min.   : 15.39  
##  1st Qu.: 67.39  
##  Median : 79.79  
##  Mean   : 77.84  
##  3rd Qu.: 88.98  
##  Max.   :100.00  
##  NA's   :74419

Part 2: Data Processing

Remove invalid data

SEA_clean <- SEA %>% filter(PM2.5 >0) #identify data 

**Reshaping data for analysing PM2.5 trend

pm2.5 <- SEA_clean %>% select(c("station","date","PM2.5")) %>%  
  pivot_wider(names_from= station, values_from = PM2.5, values_fn =list(PM2.5 = mean))

save.image("~/DIME RA/Data construction/SEA_cl.RData")
##       date                       station           Parameter        
##  Min.   :2016-01-01 01:00:00   Length:251573      Length:251573     
##  1st Qu.:2017-11-10 10:00:00   Class :character   Class :character  
##  Median :2019-07-19 08:00:00   Mode  :character   Mode  :character  
##  Mean   :2019-03-30 01:40:39                                        
##  3rd Qu.:2020-07-18 11:00:00                                        
##  Max.   :2022-01-01 12:00:00                                        
##                                                                     
##       Year           Month            Day             Hour      
##  Min.   :2016    Min.   : 1.00   Min.   : 1.00   Min.   : 0.00  
##  1st Qu.:2018    1st Qu.: 4.00   1st Qu.: 8.00   1st Qu.: 6.00  
##  Median :2019    Median : 7.00   Median :16.00   Median :12.00  
##  Mean   :2019    Mean   : 6.88   Mean   :15.84   Mean   :11.55  
##  3rd Qu.:2020    3rd Qu.:10.00   3rd Qu.:23.00   3rd Qu.:18.00  
##  Max.   :2022    Max.   :12.00   Max.   :31.00   Max.   :23.00  
##  NA's   :43847   NA's   :43847   NA's   :43847   NA's   :43847  
##      PM2.5             AQI         AQI.Category         Raw.Conc.      
##  Min.   :  0.10   Min.   :  0.00   Length:251573      Min.   :-999.00  
##  1st Qu.: 12.50   1st Qu.: 56.00   Class :character   1st Qu.:  14.00  
##  Median : 22.00   Median : 80.00   Mode  :character   Median :  25.00  
##  Mean   : 29.02   Mean   : 87.27                      Mean   :  29.81  
##  3rd Qu.: 38.30   3rd Qu.:118.00                      3rd Qu.:  43.00  
##  Max.   :865.30   Max.   :741.00                      Max.   : 985.00  
##                   NA's   :43847                       NA's   :43847    
##   Conc..Unit          Duration           QC.Name                ws       
##  Length:251573      Length:251573      Length:251573      Min.   : 0.00  
##  Class :character   Class :character   Class :character   1st Qu.: 1.50  
##  Mode  :character   Mode  :character   Mode  :character   Median : 2.23  
##                                                           Mean   : 2.47  
##                                                           3rd Qu.: 3.35  
##                                                           Max.   :49.40  
##                                                           NA's   :46415  
##        wd           air_temp       atmos_pres       visibility    
##  Min.   :  0.0   Min.   : 5.50   Min.   : 997.6   Min.   :   50   
##  1st Qu.: 75.0   1st Qu.:25.00   1st Qu.:1009.0   1st Qu.: 5000   
##  Median :164.6   Median :27.00   Median :1010.4   Median : 7000   
##  Mean   :169.3   Mean   :26.81   Mean   :1010.3   Mean   : 6968   
##  3rd Qu.:255.0   3rd Qu.:29.50   3rd Qu.:1011.7   3rd Qu.: 8000   
##  Max.   :360.0   Max.   :40.00   Max.   :1023.5   Max.   :22000   
##  NA's   :69321   NA's   :46427   NA's   :212412   NA's   :117739  
##        RH        
##  Min.   : 18.34  
##  1st Qu.: 70.46  
##  Median : 83.49  
##  Mean   : 80.17  
##  3rd Qu.: 91.49  
##  Max.   :100.00  
##  NA's   :46441

Pre-analysis for flagging data issues

load("~/DIME RA/Data construction/SEA_cl.RData")

timePlot(pm2.5, avg.time = "day", 
         pollutant = c("Hanoi","Ho Chi Minh City","Vientiane","Rangoon"),
         cols = c("#CA0020","orange","black","#76d6c8","purple","#4DAC26"),
         lty = 1, key.columns = 2,lwd=1,
         ylab=expression("Daily PM"[2.5]*" Concentration ("~mu~"g/m"^3*")"),
         group = TRUE)

timePlot(pm2.5, avg.time = "day", 
         pollutant = c("Jakarta Central","Singapore"),
         ylab = expression("Daily PM"[2.5]*" Concentration ("~mu~"g/m"^3*")"),
         cols = c("#F4A582","#404040"),
         lty = 1, key.columns = 2,lwd=1,
         group = TRUE)

Some flagged issues; * Hanoi missing value in 2019 * PM2.5 data in Vientaine and Rangoon are not available in 2016 - 2019

Part 3: Data Analysis

SEA_clean_by_day <- timeAverage(SEA_clean, avg.time = "day", type = "station") #to observe the trend of Haze by daily PM2.5 concentration
save.image("DIME RA/~/DIME RA/Data construction/SEA_Cl_day.RData")
load("~/DIME RA/Data construction/SEA_Cl_day.RData")

SEA_clean_by_day$date <- as.Date(SEA_clean_by_day$date)
haze_summary <- ggplot(SEA_clean_by_day %>%filter(RH < 90 & visibility < 5000), aes(x= date, y=PM2.5, color= station,shape=station,fill=station)) +
                        geom_point(size=3,alpha=0.8,stroke=1)+ 
                        labs(y= expression('Daily PM'[2.5]*' concentration ('~mu~'g/m'^3*')'))+
                        scale_x_date(date_breaks="6 months", date_labels = "%b-%y")+
                        scale_y_continuous(expand = c(0,0))

haze_summary <- haze_summary+theme(axis.line = element_line(color = 'grey50'),
                                   panel.grid.major.x = element_line(color = '#d5dce0', size = 10),
                                   panel.grid.major.y = element_blank(),
                                   panel.grid.minor= element_blank(),
                                   panel.background = element_rect(fill="#F7F7F7"),
                                   legend.position = "bottom")
haze_summary +  geom_hline(yintercept = 50, lty = 2, lwd = 1) +
                scale_color_manual(values=c(rep("#CA0020",2), rep("#018571",2),"#B8E186","#4DAC26","#F4A582","#404040"))+
                scale_shape_manual(values=c(16:21,rep(1,2)))+
                geom_ribbon(aes(ymin=0,ymax=50),fill="pink",alpha=0.1)

PM2.5 concentration based on visibility and Relative Humidity

load("~/DIME RA/Data construction/SEA_Cl_day.RData")


vis <- scatterPlot(SEA_clean_by_day %>% filter(RH <90),
            avg.time = "day",
            x="visibility", xlab = "Visibility (m)",
            y="PM2.5", ylab = "Daily concentration PM2.5 (µg/m3)", 
            z="RH", type ="station", cols = "jet",
            main = expression("Daily PM"[2.5]*" vs Visibilty based on Relative Humidily in 2016 - 2021"))

trellis.last.object() +
  latticeExtra::layer(panel.abline(h = 50, v=5000, lty =2, lwd = 2))