library(tidyverse)
[30m── [1mAttaching packages[22m ──────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──[39m
[30m[32m✔[30m [34mggplot2[30m 3.1.0 [32m✔[30m [34mpurrr [30m 0.3.0
[32m✔[30m [34mtibble [30m 2.0.1 [32m✔[30m [34mdplyr [30m 0.8.0
[32m✔[30m [34mtidyr [30m 0.8.2 [32m✔[30m [34mstringr[30m 1.4.0
[32m✔[30m [34mreadr [30m 1.3.1 [32m✔[30m [34mforcats[30m 0.3.0[39m
[30m── [1mConflicts[22m ─────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[30m [34mdplyr[30m::[32mfilter()[30m masks [34mstats[30m::filter()
[31m✖[30m [34mdplyr[30m::[32mlag()[30m masks [34mstats[30m::lag()[39m
library(janitor)
Hourly Weather data downloaded from http://www.frontierweather.com/historicaldatasubscribers_hourly.html
Daily Weather data downloaded from http://www.frontierweather.com/weatherdatastore.html
houston_weather <- read_delim("weather_data/KIAH_daily.txt", delim = ",")
Parsed with column specification:
cols(
Site4 = [31mcol_character()[39m,
Date = [31mcol_character()[39m,
Source = [31mcol_character()[39m,
`Max Temp` = [32mcol_double()[39m,
`Min Temp` = [32mcol_double()[39m,
`Avg Temp` = [32mcol_double()[39m,
HDDs = [32mcol_double()[39m,
CDDs = [32mcol_double()[39m,
`Precipitation Water Equiv` = [32mcol_double()[39m,
Snowfall = [32mcol_double()[39m,
`Snow/Ice Depth` = [32mcol_double()[39m
)
Site4 : Name of the site
Date : Date when the data was collected
Source : Name of their instrument
Max Temp : Maximum temperature of the day
Min Temp : Minimum temperature of the day
Avg Temp : Average temperature (Max + Min)/2
HDDs : Heating degree day (Temps below 65) https://w1.weather.gov/glossary/index.php?letter=h
CDDs : Cooling degree day (Temps above 65) - A form of degree day used to estimate energy requirements for air conditioning or refrigeration. For example, if a location experiences a mean temperature of 75°F on a certain day, there were 10 CDD (Cooling Degree Days) that day because 75 - 65 = 10.) https://w1.weather.gov/glossary/index.php?letter=c
Precipitation Water Equiv : Rain in inches
Snowfall : Amount of snowfall since 24 hours (https://www.weather.gov/gsp/snow)
Snow/Ice Depth : The depth of the new and old snow remaining on the ground at observation time (https://www.weather.gov/gsp/snow)
??read_delim
houston_weather_df <- houston_weather %>%
mutate(Date=as.Date(Date,format='%m/%d/%Y')) %>%
separate(Date,c("year","month","day")) %>%
janitor::clean_names()
chicago_weather <- read_csv("weather_data/KORD_daily.txt")
Parsed with column specification:
cols(
Site4 = [31mcol_character()[39m,
Date = [31mcol_character()[39m,
Source = [31mcol_character()[39m,
`Max Temp` = [32mcol_double()[39m,
`Min Temp` = [32mcol_double()[39m,
`Avg Temp` = [32mcol_double()[39m,
HDDs = [32mcol_double()[39m,
CDDs = [32mcol_double()[39m,
`Precipitation Water Equiv` = [32mcol_double()[39m,
Snowfall = [32mcol_double()[39m,
`Snow/Ice Depth` = [32mcol_double()[39m
)
Notice we have used different function but without explicitly mentioning delimiter.
chicago_weather_df <- chicago_weather %>%
mutate(Date=as.Date(Date,format='%m/%d/%Y')) %>%
separate(Date,c("year","month","day")) %>%
janitor::clean_names()
seattle_weather_df <- read_csv("weather_data/KSEA_daily.txt") %>%
mutate(Date = as.Date(Date, format='%m/%d/%Y')) %>%
separate(Date, c("year","month","day")) %>%
janitor::clean_names()
Parsed with column specification:
cols(
Site4 = [31mcol_character()[39m,
Date = [31mcol_character()[39m,
Source = [31mcol_character()[39m,
`Max Temp` = [32mcol_double()[39m,
`Min Temp` = [32mcol_double()[39m,
`Avg Temp` = [32mcol_double()[39m,
HDDs = [32mcol_double()[39m,
CDDs = [32mcol_double()[39m,
`Precipitation Water Equiv` = [32mcol_double()[39m,
Snowfall = [32mcol_double()[39m,
`Snow/Ice Depth` = [32mcol_double()[39m
)
houston_weather_df %>%
bind_rows(., chicago_weather_df, seattle_weather_df) %>%
group_by(site4, year) %>%
summarise(mean_temp_year=mean(avg_temp)) %>%
ungroup() %>%
ggplot(.,aes(x = as.numeric(year), y = mean_temp_year, color = site4)) +
geom_line(lwd = 2) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) +
theme_bw()
Fix the x axis label and the legend??
houston_weather_df %>%
bind_rows(., chicago_weather_df, seattle_weather_df) %>%
ggplot(., aes(x=avg_temp, fill=site4, color=site4)) +
geom_histogram(position="identity", alpha = 0.5)
geom_histogram(position="identity")
geom_bar: na.rm = FALSE
stat_bin: binwidth = NULL, bins = NULL, na.rm = FALSE, pad = FALSE
position_identity
houston_weather_df %>%
bind_rows(., chicago_weather_df, seattle_weather_df) %>%
ggplot(.,aes(avg_temp, color = site4)) +
stat_ecdf(geom = "point")
A CDF is a function such as y=f(x) where y is the probability of the number x, or any lower number, being chosen at random from that distribution.
source: https://www.che.utah.edu/~tony/course/material/Statistics/18_rv_pdf_cdf.php
Here we are talking about ECDF “Empirical” CDF. This means this is not a representation of the entire set (or theoretical distribution) but what was observed and hence the word “empirical”.
stem(houston_weather_df$max_temp)
The decimal point is 1 digit(s) to the right of the |
2 | 788889
3 | 0011112223333333344444444
3 | 5555666666666677777777777777888888888888999999999999999
4 | 00000000000000000001111111111111111111111222222222222222222222222222+74
4 | 55555555555555555555555555555555555555555555555555555556666666666666+210
5 | 00000000000000000000000000000000000000000000000000000000000000000000+395
5 | 55555555555555555555555555555555555555555555555555555555555555555555+632
6 | 00000000000000000000000000000000000000000000000000000000000000000000+850
6 | 55555555555555555555555555555555555555555555555555555555555555555555+1193
7 | 00000000000000000000000000000000000000000000000000000000000000000000+1615
7 | 55555555555555555555555555555555555555555555555555555555555555555555+2158
8 | 00000000000000000000000000000000000000000000000000000000000000000000+2383
8 | 55555555555555555555555555555555555555555555555555555555555555555555+2507
9 | 00000000000000000000000000000000000000000000000000000000000000000000+3016
9 | 55555555555555555555555555555555555555555555555555555555555555555555+1743
10 | 00000000000000000000000000000000000000000000000000000000000000000000+167
10 | 55555566777777899
It can be informative but compared to histogram or ecdf plot, it can be hard to read.
houston_weather_df %>%
bind_rows(chicago_weather_df, seattle_weather_df) %>%
group_by(site4,month) %>%
summarise(max = max(max_temp,na.rm = T),
min = min(min_temp, na.rm = T),
mean = mean(avg_temp, na.rm = T)) %>%
ungroup() %>%
ggplot(., aes(x = month, y = mean)) +
geom_linerange(aes( ymin = min, ymax = max), lwd = 2, color = "#FF5400") +
geom_point(size = 3, color = "#2142A6") +
coord_flip() +
theme_bw(base_size = 14) +
geom_hline(yintercept = 65, linetype = "dashed") +
xlab("Month") +
ylab("Temperature in Farahenit") +
facet_grid(.~site4)
houston_weather_df %>%
bind_rows(chicago_weather_df) %>%
bind_rows(seattle_weather_df) %>%
group_by(site4, month) %>%
summarise(mean_avg_temp = mean(avg_temp,na.rm = T)) %>%
mutate(pct_change = (mean_avg_temp / lag(mean_avg_temp) - 1) * 100) %>%
mutate(pct_change = case_when(
is.na(pct_change) ~ 0,
TRUE ~ as.numeric(pct_change)
)) %>%
arrange(pct_change) %>%
ggplot(.,aes(x = month, y = pct_change, color = site4)) +
geom_line(aes(group = site4), size = 1)+
geom_point() +
theme_bw()
houston_weather_df %>%
bind_rows(chicago_weather_df, seattle_weather_df) %>%
group_by(site4,year) %>%
summarise(sum = sum(precipitation_water_equiv, na.rm = T)) %>%
ungroup() %>%
ggplot(., aes(x = year, y = sum, color = site4) ) +
geom_point() +
geom_line(aes(group = site4), size = 1)+
theme_bw(base_size = 14) +
theme(axis.text.x = element_text(angle = 65, hjust = 1)) +
ylab("Rainfall in inches")
This graph shows total rainfall in inches over the years.
houston_weather_df %>%
bind_rows(chicago_weather_df, seattle_weather_df) %>%
group_by(site4,month) %>%
summarise(max = max(precipitation_water_equiv,na.rm = T),
mean = mean(precipitation_water_equiv, na.rm = T)) %>%
ungroup() %>%
ggplot(. ) +
geom_point(aes(x = month, y = mean,size = max), color = "#2142A6") +
coord_flip() +
theme_bw(base_size = 14) +
xlab("Month") +
ylab("Average rain fall in inches") +
facet_wrap(.~site4)
houston_weather_hourly <- read_csv("weather_data/KIAH.txt") %>%
mutate(date=as.Date(Date,format='%m/%d/%Y')) %>%
separate(date,c("year","month","day")) %>%
janitor::clean_names()
Parsed with column specification:
cols(
Site = [31mcol_character()[39m,
Date = [31mcol_character()[39m,
Hour = [32mcol_double()[39m,
Temperature = [32mcol_double()[39m,
Dewpoint = [32mcol_double()[39m,
RH = [32mcol_double()[39m,
WindDir = [33mcol_logical()[39m,
Windspeed = [32mcol_double()[39m,
CldFrac = [32mcol_double()[39m,
MSLP = [33mcol_logical()[39m,
Weather = [31mcol_character()[39m,
Precip = [33mcol_logical()[39m,
Source = [31mcol_character()[39m
)
435615 parsing failures.
row col expected actual file
4296 Precip 1/0/T/F/TRUE/FALSE 0.00 'weather_data/KIAH.txt'
7237 Precip 1/0/T/F/TRUE/FALSE 0.00 'weather_data/KIAH.txt'
8392 Precip 1/0/T/F/TRUE/FALSE 0.00 'weather_data/KIAH.txt'
8760 Precip 1/0/T/F/TRUE/FALSE 0.00 'weather_data/KIAH.txt'
8761 Precip 1/0/T/F/TRUE/FALSE 1.0e-04 'weather_data/KIAH.txt'
.... ...... .................. ....... .......................
See problems(...) for more details.
chicago_weather_hourly <- read_csv("weather_data/KORD.txt") %>%
mutate(date = as.Date(Date, format = '%m/%d/%Y')) %>%
separate(date, c("year","month","day")) %>%
janitor::clean_names()
Parsed with column specification:
cols(
Site = [31mcol_character()[39m,
Date = [31mcol_character()[39m,
Hour = [32mcol_double()[39m,
Temperature = [32mcol_double()[39m,
Dewpoint = [32mcol_double()[39m,
RH = [32mcol_double()[39m,
WindDir = [33mcol_logical()[39m,
Windspeed = [32mcol_double()[39m,
CldFrac = [32mcol_double()[39m,
MSLP = [33mcol_logical()[39m,
Weather = [31mcol_character()[39m,
Precip = [33mcol_logical()[39m,
Source = [31mcol_character()[39m
)
435438 parsing failures.
row col expected actual file
5556 Precip 1/0/T/F/TRUE/FALSE 0.00 'weather_data/KORD.txt'
8760 Precip 1/0/T/F/TRUE/FALSE 0.00 'weather_data/KORD.txt'
8761 Precip 1/0/T/F/TRUE/FALSE 0.00 'weather_data/KORD.txt'
8762 Precip 1/0/T/F/TRUE/FALSE 0.00 'weather_data/KORD.txt'
8763 Precip 1/0/T/F/TRUE/FALSE 1.0e-04 'weather_data/KORD.txt'
.... ...... .................. ....... .......................
See problems(...) for more details.
seattle_weather_hourly <- read_csv("weather_data/KSEA.txt") %>%
mutate(date=as.Date(Date, format = '%m/%d/%Y')) %>%
separate(date, c("year", "month", "day")) %>%
janitor::clean_names()
Parsed with column specification:
cols(
Site = [31mcol_character()[39m,
Date = [31mcol_character()[39m,
Hour = [32mcol_double()[39m,
Temperature = [32mcol_double()[39m,
Dewpoint = [32mcol_double()[39m,
RH = [32mcol_double()[39m,
WindDir = [33mcol_logical()[39m,
Windspeed = [32mcol_double()[39m,
CldFrac = [32mcol_double()[39m,
MSLP = [33mcol_logical()[39m,
Weather = [31mcol_character()[39m,
Precip = [32mcol_double()[39m,
Source = [31mcol_character()[39m
)
261428 parsing failures.
row col expected actual file
52584 WindDir 1/0/T/F/TRUE/FALSE 120 'weather_data/KSEA.txt'
52584 MSLP 1/0/T/F/TRUE/FALSE 29.74 'weather_data/KSEA.txt'
52585 WindDir 1/0/T/F/TRUE/FALSE 120 'weather_data/KSEA.txt'
52585 MSLP 1/0/T/F/TRUE/FALSE 29.73 'weather_data/KSEA.txt'
52586 WindDir 1/0/T/F/TRUE/FALSE 000 'weather_data/KSEA.txt'
..... ....... .................. ...... .......................
See problems(...) for more details.
houston_weather_hourly %>%
bind_rows(., chicago_weather_hourly,seattle_weather_hourly ) %>%
mutate(date=as.Date(date,format='%m/%d/%Y')) %>%
separate(date,c("year","month","day")) %>%
group_by(site, year) %>%
summarise(mean_cloud_coverage = mean(`cld_frac`)) %>%
ungroup() %>%
ggplot(.,aes(x = as.numeric(year), y = mean_cloud_coverage, color = site)) +
geom_line() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
scale_x_continuous(breaks = scales::pretty_breaks(n = 10))
houston_weather_hourly %>%
bind_rows(., chicago_weather_hourly,seattle_weather_hourly ) %>%
mutate(date=as.Date(date,format='%m/%d/%Y')) %>%
separate(date,c("year","month","day")) %>%
group_by(site, month) %>%
summarise(sum_cloud_coverage = sum(`cld_frac`),avg_cloud_coverage = mean(`cld_frac`)) %>%
ungroup() %>%
ggplot(.,aes(x = as.numeric(month), y = sum_cloud_coverage, color = site)) +
geom_line(linetype = "dotdash") +
geom_point(aes(inherit.aes=TRUE, size = avg_cloud_coverage)) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
scale_x_continuous(breaks = scales::pretty_breaks(n = 10))
Ignoring unknown aesthetics: inherit.aes
houston_weather_hourly %>%
bind_rows(., chicago_weather_hourly,seattle_weather_hourly ) %>%
mutate(date=as.Date(date,format='%m/%d/%Y')) %>%
separate(date,c("year","month","day")) %>%
group_by(site, year, month, day) %>%
summarise(avg_cloud_coverage = mean(`cld_frac`)) %>%
ungroup() %>%
group_by(site, year) %>%
summarise(no_cloudy_days = sum(avg_cloud_coverage < .30)) %>%
ggplot(.,aes(x = as.numeric(year), y = no_cloudy_days, color = site)) +
geom_line(linetype=2) +
geom_point(aes(inherit.aes=TRUE)) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) +
ylab("Number of days with average cloud cover < 30%") +
xlab("Year")
Ignoring unknown aesthetics: inherit.aes
houston_weather_hourly %>%
bind_rows(., chicago_weather_hourly,seattle_weather_hourly) %>%
mutate(date=as.Date(date,format = '%m/%d/%Y')) %>%
separate(date,c("year","month","day")) %>%
group_by(site, year) %>%
summarise(mean_windspeed = mean(`windspeed`)) %>%
ungroup() %>%
ggplot(.,aes(x = as.numeric(year), y = mean_windspeed, color = site)) +
geom_line(size = 2) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
scale_x_continuous(breaks = scales::pretty_breaks(n = 10))