Main content includes:
- Part 1: Data Collection
- Part 2: Data processing
- Part 3: Data analysis
rm(list = ls(all.names = TRUE))
PM 2.5 data is collected from multiple sources as Table 1
Station_name | Station type | Location | Source |
---|---|---|---|
US embassy & consulate | Urban | Jarkata, Indonesia | https://www.airnow.gov/international/us-embassies-and-consulates/#Indonesia |
US embassy & consulate | Urban | Vietiane, Laos | https://www.airnow.gov/international/us-embassies-and-consulates/#Laos |
US embassy & consulate | Urban | Rangoon, Myanmar | https://www.airnow.gov/international/us-embassies-and-consulates/#Burma |
US embassy & consulate | Urban | Ho Chi Minh, Vietnam | https://www.airnow.gov/international/us-embassies-and-consulates/#Vietnam |
US embassy & consulate | Urban | Ha Noi, Vietnam | https://www.airnow.gov/international/us-embassies-and-consulates/#Vietnam$Hanoi |
5 stations of Singapore pollution control department | Urban | Singapore, Singapore | https://data.gov.sg/dataset/historical-1-hr-pm2-5 |
Note: PM2.5 concentration from the Historical regional 1-hr PM2.5 value data set accessed on 1 February 2022 from Singapore National Environment Agency which is made available under the terms of the Singapore Open Data Licence version 1.0: https://data.gov.sg/dataset/historical-1-hr-pm2-5
Because data from different sources have different structures, I use different step to collect data from US embassy consulate and data collected from other sites (Singapore Pollution Control Department)
The source of meteorological parameters (wind speed/direction, temperature, pressure, and relative humidity) were from the Global Surface Summary of the Day (GSOD) stations extrated from NOAA Integrated Surface Database (12- melevation) as Table 2
**Station/_name** | Location | **GSOD/_ID** |
---|---|---|
Soekarno Hatta Intl AP | Jarkata, Indonesia | 967490-99999 |
Wattay | Vietiane, Laos | 489400-99999 |
Mingaladon | Rangoon, Myanmar | 480960-99999 |
Tan Son Nhat Intl AP | Ho Chi Minh, Vietnam | 488200-99999 |
Hanoi Noibai Intl AP | Ha Noi, Vietnam | 489000-99999 |
Changi Airport | Singapore, Singapore | 486980-99999 |
#create a list including name of stations
list <- c("Hanoi"," HoChiMinhCity", "JakartaCentral","Rangoon","Vientiane")
#Create a list including time zone for each geography location
timezone <- c("Asia/Ho_Chi_Minh","Asia/Ho_Chi_Minh","Asia/Jakarta","Asia/Ho_Chi_Minh","Asia/Vientiane")
#create a list including GSOD station code id for generating meteorology data from NOAA stations
stationid <- c("488200-99999","489000-99999","967490-99999","480960-99999","489400-99999")
Create a loop to merge data including Meteorology variables (NOAA stations) and PM2.5 data (US embassy and consulate stations)
airnow_sea <- data.frame()
for (i in 1:5){
for (y in 2016:2021) {
met <- importNOAA(code = stationid[i], year = y)
if (is.null(met)){
}else{
airnow <- read.csv(paste0("http://dosairnowdata.org/dos/historical/",list[i],"/",y,"/",list[i],"_PM2.5_",y,"_YTD.csv"),
header = T)
airnow$Date..LT. <- as.POSIXct(airnow$Date..LT.,tz= timezone[i],format ="%Y-%m-%d %H:%M %p")
attr(airnow$Date..LT., "tzone") <- "Asia/Ho_Chi_Minh"
airnow <- as.data.frame(airnow)
attr(met$date, "tzone") <- "Asia/Ho_Chi_Minh"
airnow <- merge(airnow, met %>% select("date","ws","wd","air_temp","atmos_pres","visibility","RH"), by.x = "Date..LT.", by.y = "date", all = TRUE)
attr(airnow$Date..LT., "tzone") <- "Asia/Ho_Chi_Minh"
airnow_sea <- bind_rows(airnow_sea, airnow)
}
}
}
names(airnow_sea)[1:2] <- c("date","station")
names(airnow_sea)[8] <- "PM2.5"
save.image(file ="~/DIME RA/Data construction/airnow.RData")
Overlook data collected
## date station Parameter
## Min. :2016-01-01 01:00:00 Length:359589 Length:359589
## 1st Qu.:2017-11-12 01:00:00 Class :character Class :character
## Median :2019-06-28 02:00:00 Mode :character Mode :character
## Mean :2019-03-29 14:23:38
## 3rd Qu.:2020-08-12 21:00:00
## Max. :2022-01-01 12:00:00
##
## Year Month Day Hour
## Min. :2016 Min. : 1.00 Min. : 1.00 Min. : 0.00
## 1st Qu.:2018 1st Qu.: 4.00 1st Qu.: 8.00 1st Qu.: 6.00
## Median :2019 Median : 7.00 Median :16.00 Median :12.00
## Mean :2019 Mean : 6.87 Mean :15.82 Mean :11.51
## 3rd Qu.:2020 3rd Qu.:10.00 3rd Qu.:23.00 3rd Qu.:18.00
## Max. :2022 Max. :12.00 Max. :31.00 Max. :23.00
## NA's :143038 NA's :143038 NA's :143038 NA's :143038
## PM2.5 AQI AQI.Category Raw.Conc.
## Min. :-999.00 Min. :-999.00 Length:359589 Min. :-999.00
## 1st Qu.: 13.30 1st Qu.: 54.00 Class :character 1st Qu.: 13.00
## Median : 24.80 Median : 78.00 Mode :character Median : 25.00
## Mean : 8.55 Mean : 52.88 Mean : 27.27
## 3rd Qu.: 41.60 3rd Qu.: 116.00 3rd Qu.: 42.00
## Max. : 865.30 Max. : 741.00 Max. : 985.00
## NA's :143038 NA's :143038 NA's :143038
## Conc..Unit Duration QC.Name ws
## Length:359589 Length:359589 Length:359589 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 1.50
## Mode :character Mode :character Mode :character Median : 2.35
## Mean : 2.58
## 3rd Qu.: 3.35
## Max. :49.40
## NA's :74253
## wd air_temp atmos_pres visibility
## Min. : 0 Min. : 5.5 Min. : 992.7 Min. : 50
## 1st Qu.: 85 1st Qu.:25.0 1st Qu.:1007.6 1st Qu.: 5000
## Median :160 Median :27.0 Median :1009.8 Median : 6500
## Mean :172 Mean :27.0 Mean :1009.4 Mean : 6372
## 3rd Qu.:255 3rd Qu.:30.0 3rd Qu.:1011.5 3rd Qu.: 8000
## Max. :360 Max. :41.0 Max. :1048.1 Max. :50000
## NA's :99920 NA's :74274 NA's :310386 NA's :159775
## RH
## Min. : 15.39
## 1st Qu.: 66.32
## Median : 79.36
## Mean : 77.58
## 3rd Qu.: 90.85
## Max. :100.00
## NA's :74306
Collecting PM2.5 data from the Sing EPA website [https://data.gov.sg/dataset/historical-1-hr-pm2-5]:
Sing_pm2.5 <- read_xlsx('~/DIME RA/Raw Data/Sing-historical-1-hr-pm2-5-refined.xlsx')
head(Sing_pm2.5)
## # A tibble: 6 x 6
## `1hr_pm2.5` north south east west central
## <dttm> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2014-04-01 01:00:00 7 6 7 23 8
## 2 2014-04-01 02:00:00 10 10 10 36 10
## 3 2014-04-01 03:00:00 15 20 15 31 15
## 4 2014-04-01 04:00:00 21 14 27 36 12
## 5 2014-04-01 05:00:00 28 12 28 35 22
## 6 2014-04-01 06:00:00 28 31 21 33 24
names(Sing_pm2.5)[1] <- "date" #label the heading for consistent with other data
Sing_pm2.5$date <- as.POSIXct(Sing_pm2.5$date, tz="Asia/Singapore", format="%Y-%m-%d %H:%M:%S")
Sing_pm2.5$pm2_5 <- apply(Sing_pm2.5[,c(2,3,4,5,6)],1,mean, na.rm=TRUE) # define the average PM2.5 value of 5 stations by time
head(Sing_pm2.5)
## # A tibble: 6 x 7
## date north south east west central pm2_5
## <dttm> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2014-04-01 01:00:00 7 6 7 23 8 10.2
## 2 2014-04-01 02:00:00 10 10 10 36 10 15.2
## 3 2014-04-01 03:00:00 15 20 15 31 15 19.2
## 4 2014-04-01 04:00:00 21 14 27 36 12 22
## 5 2014-04-01 05:00:00 28 12 28 35 22 25
## 6 2014-04-01 06:00:00 28 31 21 33 24 27.4
save.image(file="~/DIME RA/Data construction/Sing_PM25.RData")
Collecting meteorology data from NOAA and merge data with PM2.5 Data
Sing_met <- importNOAA(code="486980-99999",year=2016:2020)
attr(Sing_met$date,"tzone") <- "Asia/Singapore"
load("R-analysis/Sing_PM25.RData")
Sing <- merge(Sing_pm2.5 %>% select("date","pm2_5"),Sing_met %>% select("date","ws","wd","air_temp","atmos_pres","visibility","RH"),by="date") %>% mutate(station="Singapore")
attr(Sing$date, 'tzone') <- "Asia/Ho_Chi_Minh"
names(Sing)[2] <- "PM2.5"
save.image("~/DIME RA/Data construction/Sing.RData")
## date PM2.5 ws wd air_temp atmos_pres visibility
## 1 2016-01-01 08:00:00 4.4 2.850000 23.68109 28.50000 NA NA
## 2 2016-01-01 09:00:00 7.2 3.350000 25.37408 28.50000 NA NA
## 3 2016-01-01 10:00:00 7.2 5.300000 36.41841 29.50000 1014.2 14000
## 4 2016-01-01 11:00:00 7.6 4.100000 42.46258 28.00000 NA NA
## 5 2016-01-01 12:00:00 7.4 5.900000 45.67966 29.00000 NA NA
## 6 2016-01-01 13:00:00 5.0 6.033333 40.00000 29.03333 1012.9 14000
## RH station
## 1 81.41760 Singapore
## 2 81.41760 Singapore
## 3 75.12584 Singapore
## 4 83.78904 Singapore
## 5 76.74897 Singapore
## 6 71.55129 Singapore
Compile data from 2 data sets (Sing EPA, Airnow) into one
SEA <- bind_rows(airnow_sea,Sing)
save.image("~/DIME RA/Data construction/SEA.RData")
## date station Parameter
## Min. :2016-01-01 01:00:00 Length:403436 Length:403436
## 1st Qu.:2017-10-08 11:45:00 Class :character Class :character
## Median :2019-05-28 10:00:00 Mode :character Mode :character
## Mean :2019-02-28 06:38:18
## 3rd Qu.:2020-07-04 21:15:00
## Max. :2022-01-01 12:00:00
##
## Year Month Day Hour
## Min. :2016 Min. : 1.00 Min. : 1.00 Min. : 0.00
## 1st Qu.:2018 1st Qu.: 4.00 1st Qu.: 8.00 1st Qu.: 6.00
## Median :2019 Median : 7.00 Median :16.00 Median :12.00
## Mean :2019 Mean : 6.87 Mean :15.82 Mean :11.51
## 3rd Qu.:2020 3rd Qu.:10.00 3rd Qu.:23.00 3rd Qu.:18.00
## Max. :2022 Max. :12.00 Max. :31.00 Max. :23.00
## NA's :186885 NA's :186885 NA's :186885 NA's :186885
## PM2.5 AQI AQI.Category Raw.Conc.
## Min. :-999.00 Min. :-999.00 Length:403436 Min. :-999.00
## 1st Qu.: 11.60 1st Qu.: 54.00 Class :character 1st Qu.: 13.00
## Median : 21.30 Median : 78.00 Mode :character Median : 25.00
## Mean : 9.51 Mean : 52.88 Mean : 27.27
## 3rd Qu.: 37.50 3rd Qu.: 116.00 3rd Qu.: 42.00
## Max. : 865.30 Max. : 741.00 Max. : 985.00
## NA's :143038 NA's :186885 NA's :186885
## Conc..Unit Duration QC.Name ws
## Length:403436 Length:403436 Length:403436 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 1.50
## Mode :character Mode :character Mode :character Median : 2.35
## Mean : 2.57
## 3rd Qu.: 3.35
## Max. :49.40
## NA's :74366
## wd air_temp atmos_pres visibility
## Min. : 0.0 Min. : 5.50 Min. : 992.7 Min. : 50
## 1st Qu.: 80.0 1st Qu.:25.20 1st Qu.:1008.1 1st Qu.: 5000
## Median :160.0 Median :27.00 Median :1009.9 Median : 7000
## Mean :170.6 Mean :27.15 Mean :1009.6 Mean : 6810
## 3rd Qu.:255.0 3rd Qu.:30.00 3rd Qu.:1011.5 3rd Qu.: 8000
## Max. :360.0 Max. :41.00 Max. :1048.1 Max. :50000
## NA's :104524 NA's :74387 NA's :339663 NA's :183293
## RH
## Min. : 15.39
## 1st Qu.: 67.39
## Median : 79.79
## Mean : 77.84
## 3rd Qu.: 88.98
## Max. :100.00
## NA's :74419
Remove invalid data
SEA_clean <- SEA %>% filter(PM2.5 >0) #identify data
**Reshaping data for analysing PM2.5 trend
pm2.5 <- SEA_clean %>% select(c("station","date","PM2.5")) %>%
pivot_wider(names_from= station, values_from = PM2.5, values_fn =list(PM2.5 = mean))
save.image("~/DIME RA/Data construction/SEA_cl.RData")
## date station Parameter
## Min. :2016-01-01 01:00:00 Length:251573 Length:251573
## 1st Qu.:2017-11-10 10:00:00 Class :character Class :character
## Median :2019-07-19 08:00:00 Mode :character Mode :character
## Mean :2019-03-30 01:40:39
## 3rd Qu.:2020-07-18 11:00:00
## Max. :2022-01-01 12:00:00
##
## Year Month Day Hour
## Min. :2016 Min. : 1.00 Min. : 1.00 Min. : 0.00
## 1st Qu.:2018 1st Qu.: 4.00 1st Qu.: 8.00 1st Qu.: 6.00
## Median :2019 Median : 7.00 Median :16.00 Median :12.00
## Mean :2019 Mean : 6.88 Mean :15.84 Mean :11.55
## 3rd Qu.:2020 3rd Qu.:10.00 3rd Qu.:23.00 3rd Qu.:18.00
## Max. :2022 Max. :12.00 Max. :31.00 Max. :23.00
## NA's :43847 NA's :43847 NA's :43847 NA's :43847
## PM2.5 AQI AQI.Category Raw.Conc.
## Min. : 0.10 Min. : 0.00 Length:251573 Min. :-999.00
## 1st Qu.: 12.50 1st Qu.: 56.00 Class :character 1st Qu.: 14.00
## Median : 22.00 Median : 80.00 Mode :character Median : 25.00
## Mean : 29.02 Mean : 87.27 Mean : 29.81
## 3rd Qu.: 38.30 3rd Qu.:118.00 3rd Qu.: 43.00
## Max. :865.30 Max. :741.00 Max. : 985.00
## NA's :43847 NA's :43847
## Conc..Unit Duration QC.Name ws
## Length:251573 Length:251573 Length:251573 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 1.50
## Mode :character Mode :character Mode :character Median : 2.23
## Mean : 2.47
## 3rd Qu.: 3.35
## Max. :49.40
## NA's :46415
## wd air_temp atmos_pres visibility
## Min. : 0.0 Min. : 5.50 Min. : 997.6 Min. : 50
## 1st Qu.: 75.0 1st Qu.:25.00 1st Qu.:1009.0 1st Qu.: 5000
## Median :164.6 Median :27.00 Median :1010.4 Median : 7000
## Mean :169.3 Mean :26.81 Mean :1010.3 Mean : 6968
## 3rd Qu.:255.0 3rd Qu.:29.50 3rd Qu.:1011.7 3rd Qu.: 8000
## Max. :360.0 Max. :40.00 Max. :1023.5 Max. :22000
## NA's :69321 NA's :46427 NA's :212412 NA's :117739
## RH
## Min. : 18.34
## 1st Qu.: 70.46
## Median : 83.49
## Mean : 80.17
## 3rd Qu.: 91.49
## Max. :100.00
## NA's :46441
Pre-analysis for flagging data issues
load("~/DIME RA/Data construction/SEA_cl.RData")
timePlot(pm2.5, avg.time = "day",
pollutant = c("Hanoi","Ho Chi Minh City","Vientiane","Rangoon"),
cols = c("#CA0020","orange","black","#76d6c8","purple","#4DAC26"),
lty = 1, key.columns = 2,lwd=1,
ylab=expression("Daily PM"[2.5]*" Concentration ("~mu~"g/m"^3*")"),
group = TRUE)
timePlot(pm2.5, avg.time = "day",
pollutant = c("Jakarta Central","Singapore"),
ylab = expression("Daily PM"[2.5]*" Concentration ("~mu~"g/m"^3*")"),
cols = c("#F4A582","#404040"),
lty = 1, key.columns = 2,lwd=1,
group = TRUE)
Some flagged issues; * Hanoi missing value in 2019 * PM2.5 data in Vientaine and Rangoon are not available in 2016 - 2019
SEA_clean_by_day <- timeAverage(SEA_clean, avg.time = "day", type = "station") #to observe the trend of Haze by daily PM2.5 concentration
save.image("DIME RA/~/DIME RA/Data construction/SEA_Cl_day.RData")
load("~/DIME RA/Data construction/SEA_Cl_day.RData")
SEA_clean_by_day$date <- as.Date(SEA_clean_by_day$date)
haze_summary <- ggplot(SEA_clean_by_day %>%filter(RH < 90 & visibility < 5000), aes(x= date, y=PM2.5, color= station,shape=station,fill=station)) +
geom_point(size=3,alpha=0.8,stroke=1)+
labs(y= expression('Daily PM'[2.5]*' concentration ('~mu~'g/m'^3*')'))+
scale_x_date(date_breaks="6 months", date_labels = "%b-%y")+
scale_y_continuous(expand = c(0,0))
haze_summary <- haze_summary+theme(axis.line = element_line(color = 'grey50'),
panel.grid.major.x = element_line(color = '#d5dce0', size = 10),
panel.grid.major.y = element_blank(),
panel.grid.minor= element_blank(),
panel.background = element_rect(fill="#F7F7F7"),
legend.position = "bottom")
haze_summary + geom_hline(yintercept = 50, lty = 2, lwd = 1) +
scale_color_manual(values=c(rep("#CA0020",2), rep("#018571",2),"#B8E186","#4DAC26","#F4A582","#404040"))+
scale_shape_manual(values=c(16:21,rep(1,2)))+
geom_ribbon(aes(ymin=0,ymax=50),fill="pink",alpha=0.1)
PM2.5 concentration based on visibility and Relative Humidity
load("~/DIME RA/Data construction/SEA_Cl_day.RData")
vis <- scatterPlot(SEA_clean_by_day %>% filter(RH <90),
avg.time = "day",
x="visibility", xlab = "Visibility (m)",
y="PM2.5", ylab = "Daily concentration PM2.5 (µg/m3)",
z="RH", type ="station", cols = "jet",
main = expression("Daily PM"[2.5]*" vs Visibilty based on Relative Humidily in 2016 - 2021"))
trellis.last.object() +
latticeExtra::layer(panel.abline(h = 50, v=5000, lty =2, lwd = 2))