library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0 ✔ purrr 0.3.0
## ✔ tibble 2.0.1 ✔ dplyr 0.8.0
## ✔ tidyr 0.8.2 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.3.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(janitor)
Hourly Weather data downloaded from http://www.frontierweather.com/historicaldatasubscribers_hourly.html
Daily Weather data downloaded from http://www.frontierweather.com/weatherdatastore.html
houston_weather <- read_delim("weather_data/KIAH_daily.txt", delim = ",")
## Parsed with column specification:
## cols(
## Site4 = col_character(),
## Date = col_character(),
## Source = col_character(),
## `Max Temp` = col_double(),
## `Min Temp` = col_double(),
## `Avg Temp` = col_double(),
## HDDs = col_double(),
## CDDs = col_double(),
## `Precipitation Water Equiv` = col_double(),
## Snowfall = col_double(),
## `Snow/Ice Depth` = col_double()
## )
Site4 : Name of the site
Date : Date when the data was collected
Source : Name of their instrument
Max Temp : Maximum temperature of the day
Min Temp : Minimum temperature of the day
Avg Temp : Average temperature (Max + Min)/2
HDDs : Heating degree day (Temps below 65) https://w1.weather.gov/glossary/index.php?letter=h
CDDs : Cooling degree day (Temps above 65) - A form of degree day used to estimate energy requirements for air conditioning or refrigeration. For example, if a location experiences a mean temperature of 75°F on a certain day, there were 10 CDD (Cooling Degree Days) that day because 75 - 65 = 10.) https://w1.weather.gov/glossary/index.php?letter=c
Precipitation Water Equiv : Rain in inches
Snowfall : Amount of snowfall since 24 hours (https://www.weather.gov/gsp/snow)
Snow/Ice Depth : The depth of the new and old snow remaining on the ground at observation time (https://www.weather.gov/gsp/snow)
??read_delim
houston_weather_df <- houston_weather %>%
mutate(Date=as.Date(Date,format='%m/%d/%Y')) %>%
separate(Date,c("year","month","day")) %>%
janitor::clean_names()
chicago_weather <- read_csv("weather_data/KORD_daily.txt")
## Parsed with column specification:
## cols(
## Site4 = col_character(),
## Date = col_character(),
## Source = col_character(),
## `Max Temp` = col_double(),
## `Min Temp` = col_double(),
## `Avg Temp` = col_double(),
## HDDs = col_double(),
## CDDs = col_double(),
## `Precipitation Water Equiv` = col_double(),
## Snowfall = col_double(),
## `Snow/Ice Depth` = col_double()
## )
Notice we have used different function but without explicitly mentioning delimiter.
chicago_weather_df <- chicago_weather %>%
mutate(Date=as.Date(Date,format='%m/%d/%Y')) %>%
separate(Date,c("year","month","day")) %>%
janitor::clean_names()
seattle_weather_df <- read_csv("weather_data/KSEA_daily.txt") %>%
mutate(Date = as.Date(Date, format='%m/%d/%Y')) %>%
separate(Date, c("year","month","day")) %>%
janitor::clean_names()
## Parsed with column specification:
## cols(
## Site4 = col_character(),
## Date = col_character(),
## Source = col_character(),
## `Max Temp` = col_double(),
## `Min Temp` = col_double(),
## `Avg Temp` = col_double(),
## HDDs = col_double(),
## CDDs = col_double(),
## `Precipitation Water Equiv` = col_double(),
## Snowfall = col_double(),
## `Snow/Ice Depth` = col_double()
## )
houston_weather_df %>%
bind_rows(., chicago_weather_df, seattle_weather_df) %>%
group_by(site4, year) %>%
summarise(mean_temp_year=mean(avg_temp)) %>%
ungroup() %>%
ggplot(.,aes(x = as.numeric(year), y = mean_temp_year, color = site4)) +
geom_line(lwd = 2) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) +
theme_bw()
## Warning: Removed 1 rows containing missing values (geom_path).
Fix the x axis label and the legend??
houston_weather_df %>%
bind_rows(., chicago_weather_df, seattle_weather_df) %>%
ggplot(., aes(x=avg_temp, fill=site4, color=site4)) +
geom_histogram(position="identity", alpha = 0.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 16 rows containing non-finite values (stat_bin).
geom_histogram(position="identity")
## geom_bar: na.rm = FALSE
## stat_bin: binwidth = NULL, bins = NULL, na.rm = FALSE, pad = FALSE
## position_identity
houston_weather_df %>%
bind_rows(., chicago_weather_df, seattle_weather_df) %>%
ggplot(.,aes(avg_temp, color = site4)) +
stat_ecdf(geom = "point")
## Warning: Removed 16 rows containing non-finite values (stat_ecdf).
A CDF is a function such as y=f(x) where y is the probability of the number x, or any lower number, being chosen at random from that distribution.
source: https://www.che.utah.edu/~tony/course/material/Statistics/18_rv_pdf_cdf.php
Here we are talking about ECDF “Empirical” CDF. This means this is not a representation of the entire set (or theoretical distribution) but what was observed and hence the word “empirical”.
stem(houston_weather_df$max_temp)
##
## The decimal point is 1 digit(s) to the right of the |
##
## 2 | 788889
## 3 | 0011112223333333344444444
## 3 | 5555666666666677777777777777888888888888999999999999999
## 4 | 00000000000000000001111111111111111111111222222222222222222222222222+74
## 4 | 55555555555555555555555555555555555555555555555555555556666666666666+210
## 5 | 00000000000000000000000000000000000000000000000000000000000000000000+395
## 5 | 55555555555555555555555555555555555555555555555555555555555555555555+632
## 6 | 00000000000000000000000000000000000000000000000000000000000000000000+850
## 6 | 55555555555555555555555555555555555555555555555555555555555555555555+1193
## 7 | 00000000000000000000000000000000000000000000000000000000000000000000+1615
## 7 | 55555555555555555555555555555555555555555555555555555555555555555555+2158
## 8 | 00000000000000000000000000000000000000000000000000000000000000000000+2383
## 8 | 55555555555555555555555555555555555555555555555555555555555555555555+2507
## 9 | 00000000000000000000000000000000000000000000000000000000000000000000+3016
## 9 | 55555555555555555555555555555555555555555555555555555555555555555555+1743
## 10 | 00000000000000000000000000000000000000000000000000000000000000000000+167
## 10 | 55555566777777899
It can be informative but compared to histogram or ecdf plot, it can be hard to read.
houston_weather_df %>%
bind_rows(chicago_weather_df, seattle_weather_df) %>%
group_by(site4,month) %>%
summarise(max = max(max_temp,na.rm = T),
min = min(min_temp, na.rm = T),
mean = mean(avg_temp, na.rm = T)) %>%
ungroup() %>%
ggplot(., aes(x = month, y = mean)) +
geom_linerange(aes( ymin = min, ymax = max), lwd = 2, color = "#FF5400") +
geom_point(size = 3, color = "#2142A6") +
coord_flip() +
theme_bw(base_size = 14) +
geom_hline(yintercept = 65, linetype = "dashed") +
xlab("Month") +
ylab("Temperature in Farahenit") +
facet_grid(.~site4)
houston_weather_df %>%
bind_rows(chicago_weather_df) %>%
bind_rows(seattle_weather_df) %>%
group_by(site4, month) %>%
summarise(mean_avg_temp = mean(avg_temp,na.rm = T)) %>%
mutate(pct_change = (mean_avg_temp / lag(mean_avg_temp) - 1) * 100) %>%
mutate(pct_change = case_when(
is.na(pct_change) ~ 0,
TRUE ~ as.numeric(pct_change)
)) %>%
arrange(pct_change) %>%
ggplot(.,aes(x = month, y = pct_change, color = site4)) +
geom_line(aes(group = site4), size = 1)+
geom_point() +
theme_bw()
houston_weather_df %>%
bind_rows(chicago_weather_df, seattle_weather_df) %>%
group_by(site4,year) %>%
summarise(sum = sum(precipitation_water_equiv, na.rm = T)) %>%
ungroup() %>%
ggplot(., aes(x = year, y = sum, color = site4) ) +
geom_point() +
geom_line(aes(group = site4), size = 1)+
theme_bw(base_size = 14) +
theme(axis.text.x = element_text(angle = 65, hjust = 1)) +
ylab("Rainfall in inches")
This graph shows total rainfall in inches over the years.
houston_weather_df %>%
bind_rows(chicago_weather_df, seattle_weather_df) %>%
group_by(site4,month) %>%
summarise(max = max(precipitation_water_equiv,na.rm = T),
mean = mean(precipitation_water_equiv, na.rm = T)) %>%
ungroup() %>%
ggplot(. ) +
geom_point(aes(x = month, y = mean,size = max), color = "#2142A6") +
coord_flip() +
theme_bw(base_size = 14) +
xlab("Month") +
ylab("Average rain fall in inches") +
facet_wrap(.~site4)
houston_weather_hourly <- read_csv("weather_data/KIAH.txt") %>%
mutate(date=as.Date(Date,format='%m/%d/%Y')) %>%
separate(date,c("year","month","day")) %>%
janitor::clean_names()
## Parsed with column specification:
## cols(
## Site = col_character(),
## Date = col_character(),
## Hour = col_double(),
## Temperature = col_double(),
## Dewpoint = col_double(),
## RH = col_double(),
## WindDir = col_logical(),
## Windspeed = col_double(),
## CldFrac = col_double(),
## MSLP = col_logical(),
## Weather = col_character(),
## Precip = col_logical(),
## Source = col_character()
## )
## Warning: 435615 parsing failures.
## row col expected actual file
## 4296 Precip 1/0/T/F/TRUE/FALSE 0.00 'weather_data/KIAH.txt'
## 7237 Precip 1/0/T/F/TRUE/FALSE 0.00 'weather_data/KIAH.txt'
## 8392 Precip 1/0/T/F/TRUE/FALSE 0.00 'weather_data/KIAH.txt'
## 8760 Precip 1/0/T/F/TRUE/FALSE 0.00 'weather_data/KIAH.txt'
## 8761 Precip 1/0/T/F/TRUE/FALSE 1.0e-04 'weather_data/KIAH.txt'
## .... ...... .................. ....... .......................
## See problems(...) for more details.
chicago_weather_hourly <- read_csv("weather_data/KORD.txt") %>%
mutate(date = as.Date(Date, format = '%m/%d/%Y')) %>%
separate(date, c("year","month","day")) %>%
janitor::clean_names()
## Parsed with column specification:
## cols(
## Site = col_character(),
## Date = col_character(),
## Hour = col_double(),
## Temperature = col_double(),
## Dewpoint = col_double(),
## RH = col_double(),
## WindDir = col_logical(),
## Windspeed = col_double(),
## CldFrac = col_double(),
## MSLP = col_logical(),
## Weather = col_character(),
## Precip = col_logical(),
## Source = col_character()
## )
## Warning: 435438 parsing failures.
## row col expected actual file
## 5556 Precip 1/0/T/F/TRUE/FALSE 0.00 'weather_data/KORD.txt'
## 8760 Precip 1/0/T/F/TRUE/FALSE 0.00 'weather_data/KORD.txt'
## 8761 Precip 1/0/T/F/TRUE/FALSE 0.00 'weather_data/KORD.txt'
## 8762 Precip 1/0/T/F/TRUE/FALSE 0.00 'weather_data/KORD.txt'
## 8763 Precip 1/0/T/F/TRUE/FALSE 1.0e-04 'weather_data/KORD.txt'
## .... ...... .................. ....... .......................
## See problems(...) for more details.
seattle_weather_hourly <- read_csv("weather_data/KSEA.txt") %>%
mutate(date=as.Date(Date, format = '%m/%d/%Y')) %>%
separate(date, c("year", "month", "day")) %>%
janitor::clean_names()
## Parsed with column specification:
## cols(
## Site = col_character(),
## Date = col_character(),
## Hour = col_double(),
## Temperature = col_double(),
## Dewpoint = col_double(),
## RH = col_double(),
## WindDir = col_logical(),
## Windspeed = col_double(),
## CldFrac = col_double(),
## MSLP = col_logical(),
## Weather = col_character(),
## Precip = col_double(),
## Source = col_character()
## )
## Warning: 261428 parsing failures.
## row col expected actual file
## 52584 WindDir 1/0/T/F/TRUE/FALSE 120 'weather_data/KSEA.txt'
## 52584 MSLP 1/0/T/F/TRUE/FALSE 29.74 'weather_data/KSEA.txt'
## 52585 WindDir 1/0/T/F/TRUE/FALSE 120 'weather_data/KSEA.txt'
## 52585 MSLP 1/0/T/F/TRUE/FALSE 29.73 'weather_data/KSEA.txt'
## 52586 WindDir 1/0/T/F/TRUE/FALSE 000 'weather_data/KSEA.txt'
## ..... ....... .................. ...... .......................
## See problems(...) for more details.
houston_weather_hourly %>%
bind_rows(., chicago_weather_hourly,seattle_weather_hourly ) %>%
mutate(date=as.Date(date,format='%m/%d/%Y')) %>%
separate(date,c("year","month","day")) %>%
group_by(site, year) %>%
summarise(mean_cloud_coverage = mean(`cld_frac`)) %>%
ungroup() %>%
ggplot(.,aes(x = as.numeric(year), y = mean_cloud_coverage, color = site)) +
geom_line() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
scale_x_continuous(breaks = scales::pretty_breaks(n = 10))
houston_weather_hourly %>%
bind_rows(., chicago_weather_hourly,seattle_weather_hourly ) %>%
mutate(date=as.Date(date,format='%m/%d/%Y')) %>%
separate(date,c("year","month","day")) %>%
group_by(site, month) %>%
summarise(sum_cloud_coverage = sum(`cld_frac`),avg_cloud_coverage = mean(`cld_frac`)) %>%
ungroup() %>%
ggplot(.,aes(x = as.numeric(month), y = sum_cloud_coverage, color = site)) +
geom_line(linetype = "dotdash") +
geom_point(aes(inherit.aes=TRUE, size = avg_cloud_coverage)) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
scale_x_continuous(breaks = scales::pretty_breaks(n = 10))
## Warning: Ignoring unknown aesthetics: inherit.aes
houston_weather_hourly %>%
bind_rows(., chicago_weather_hourly,seattle_weather_hourly ) %>%
mutate(date=as.Date(date,format='%m/%d/%Y')) %>%
separate(date,c("year","month","day")) %>%
group_by(site, year, month, day) %>%
summarise(avg_cloud_coverage = mean(`cld_frac`)) %>%
ungroup() %>%
group_by(site, year) %>%
summarise(no_cloudy_days = sum(avg_cloud_coverage < .30)) %>%
ggplot(.,aes(x = as.numeric(year), y = no_cloudy_days, color = site)) +
geom_line(linetype=2) +
geom_point(aes(inherit.aes=TRUE)) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) +
ylab("Number of days with average cloud cover < 30%") +
xlab("Year")
## Warning: Ignoring unknown aesthetics: inherit.aes
houston_weather_hourly %>%
bind_rows(., chicago_weather_hourly,seattle_weather_hourly) %>%
mutate(date=as.Date(date,format = '%m/%d/%Y')) %>%
separate(date,c("year","month","day")) %>%
group_by(site, year) %>%
summarise(mean_windspeed = mean(`windspeed`)) %>%
ungroup() %>%
ggplot(.,aes(x = as.numeric(year), y = mean_windspeed, color = site)) +
geom_line(size = 2) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
scale_x_continuous(breaks = scales::pretty_breaks(n = 10))