Load library

library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0     ✔ purrr   0.3.0
## ✔ tibble  2.0.1     ✔ dplyr   0.8.0
## ✔ tidyr   0.8.2     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.3.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(janitor)

Hourly Weather data downloaded from http://www.frontierweather.com/historicaldatasubscribers_hourly.html

Daily Weather data downloaded from http://www.frontierweather.com/weatherdatastore.html

Daily weather data

Read the Houston daily weather data

houston_weather <- read_delim("weather_data/KIAH_daily.txt", delim = ",")
## Parsed with column specification:
## cols(
##   Site4 = col_character(),
##   Date = col_character(),
##   Source = col_character(),
##   `Max Temp` = col_double(),
##   `Min Temp` = col_double(),
##   `Avg Temp` = col_double(),
##   HDDs = col_double(),
##   CDDs = col_double(),
##   `Precipitation Water Equiv` = col_double(),
##   Snowfall = col_double(),
##   `Snow/Ice Depth` = col_double()
## )
  • Find out what these columns mean.
    • Site4 : Name of the site

    • Date : Date when the data was collected

    • Source : Name of their instrument

    • Max Temp : Maximum temperature of the day

    • Min Temp : Minimum temperature of the day

    • Avg Temp : Average temperature (Max + Min)/2

    • HDDs : Heating degree day (Temps below 65) https://w1.weather.gov/glossary/index.php?letter=h

    • CDDs : Cooling degree day (Temps above 65) - A form of degree day used to estimate energy requirements for air conditioning or refrigeration. For example, if a location experiences a mean temperature of 75°F on a certain day, there were 10 CDD (Cooling Degree Days) that day because 75 - 65 = 10.) https://w1.weather.gov/glossary/index.php?letter=c

    • Precipitation Water Equiv : Rain in inches

    • Snowfall : Amount of snowfall since 24 hours (https://www.weather.gov/gsp/snow)

    • Snow/Ice Depth : The depth of the new and old snow remaining on the ground at observation time (https://www.weather.gov/gsp/snow)

Find out more about this function

??read_delim

“Format” the data

houston_weather_df <- houston_weather %>%
  mutate(Date=as.Date(Date,format='%m/%d/%Y')) %>%
  separate(Date,c("year","month","day")) %>%
  janitor::clean_names()

Read Chicago daily weather data

chicago_weather <- read_csv("weather_data/KORD_daily.txt") 
## Parsed with column specification:
## cols(
##   Site4 = col_character(),
##   Date = col_character(),
##   Source = col_character(),
##   `Max Temp` = col_double(),
##   `Min Temp` = col_double(),
##   `Avg Temp` = col_double(),
##   HDDs = col_double(),
##   CDDs = col_double(),
##   `Precipitation Water Equiv` = col_double(),
##   Snowfall = col_double(),
##   `Snow/Ice Depth` = col_double()
## )

Notice we have used different function but without explicitly mentioning delimiter.

“Format” Chicago weather

chicago_weather_df <- chicago_weather %>%
  mutate(Date=as.Date(Date,format='%m/%d/%Y')) %>%
  separate(Date,c("year","month","day")) %>%
  janitor::clean_names()

Get Seattle weather data and format

seattle_weather_df <- read_csv("weather_data/KSEA_daily.txt")  %>%
  mutate(Date = as.Date(Date, format='%m/%d/%Y')) %>%
  separate(Date, c("year","month","day")) %>%
  janitor::clean_names()
## Parsed with column specification:
## cols(
##   Site4 = col_character(),
##   Date = col_character(),
##   Source = col_character(),
##   `Max Temp` = col_double(),
##   `Min Temp` = col_double(),
##   `Avg Temp` = col_double(),
##   HDDs = col_double(),
##   CDDs = col_double(),
##   `Precipitation Water Equiv` = col_double(),
##   Snowfall = col_double(),
##   `Snow/Ice Depth` = col_double()
## )

Comparing mean temperature between Houston and Chicago

houston_weather_df %>%
  bind_rows(., chicago_weather_df, seattle_weather_df) %>%
  group_by(site4, year) %>%
  summarise(mean_temp_year=mean(avg_temp)) %>%
  ungroup() %>%
  ggplot(.,aes(x = as.numeric(year), y = mean_temp_year, color = site4)) +
  geom_line(lwd = 2) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) +
  theme_bw()
## Warning: Removed 1 rows containing missing values (geom_path).

Fix the x axis label and the legend??

Plot Histogram

houston_weather_df %>%
  bind_rows(., chicago_weather_df, seattle_weather_df) %>%
  ggplot(., aes(x=avg_temp, fill=site4, color=site4)) +
  geom_histogram(position="identity",  alpha = 0.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 16 rows containing non-finite values (stat_bin).

  geom_histogram(position="identity")
## geom_bar: na.rm = FALSE
## stat_bin: binwidth = NULL, bins = NULL, na.rm = FALSE, pad = FALSE
## position_identity

Plot Empirical Cumulative Distribution Function

houston_weather_df %>%
  bind_rows(., chicago_weather_df, seattle_weather_df) %>%
  ggplot(.,aes(avg_temp, color = site4)) +
  stat_ecdf(geom = "point") 
## Warning: Removed 16 rows containing non-finite values (stat_ecdf).

A CDF is a function such as y=f(x) where y is the probability of the number x, or any lower number, being chosen at random from that distribution.

source: https://www.che.utah.edu/~tony/course/material/Statistics/18_rv_pdf_cdf.php

Here we are talking about ECDF “Empirical” CDF. This means this is not a representation of the entire set (or theoretical distribution) but what was observed and hence the word “empirical”.

Stem and leaf plot

stem(houston_weather_df$max_temp)
## 
##   The decimal point is 1 digit(s) to the right of the |
## 
##    2 | 788889
##    3 | 0011112223333333344444444
##    3 | 5555666666666677777777777777888888888888999999999999999
##    4 | 00000000000000000001111111111111111111111222222222222222222222222222+74
##    4 | 55555555555555555555555555555555555555555555555555555556666666666666+210
##    5 | 00000000000000000000000000000000000000000000000000000000000000000000+395
##    5 | 55555555555555555555555555555555555555555555555555555555555555555555+632
##    6 | 00000000000000000000000000000000000000000000000000000000000000000000+850
##    6 | 55555555555555555555555555555555555555555555555555555555555555555555+1193
##    7 | 00000000000000000000000000000000000000000000000000000000000000000000+1615
##    7 | 55555555555555555555555555555555555555555555555555555555555555555555+2158
##    8 | 00000000000000000000000000000000000000000000000000000000000000000000+2383
##    8 | 55555555555555555555555555555555555555555555555555555555555555555555+2507
##    9 | 00000000000000000000000000000000000000000000000000000000000000000000+3016
##    9 | 55555555555555555555555555555555555555555555555555555555555555555555+1743
##   10 | 00000000000000000000000000000000000000000000000000000000000000000000+167
##   10 | 55555566777777899

It can be informative but compared to histogram or ecdf plot, it can be hard to read.

See the spread of the temperature by month

houston_weather_df %>%
  bind_rows(chicago_weather_df, seattle_weather_df) %>%
  group_by(site4,month) %>%
  summarise(max = max(max_temp,na.rm = T), 
            min = min(min_temp, na.rm = T), 
            mean = mean(avg_temp, na.rm = T)) %>%
  ungroup() %>%
  ggplot(., aes(x = month, y = mean)) +
  geom_linerange(aes( ymin = min, ymax = max), lwd = 2, color = "#FF5400") +
  geom_point(size = 3, color = "#2142A6") +
  coord_flip() +
  theme_bw(base_size = 14) +
  geom_hline(yintercept = 65, linetype = "dashed") +
  xlab("Month") +
  ylab("Temperature in Farahenit") +
  facet_grid(.~site4)

Percent drop as compare to previous month.

houston_weather_df %>%
  bind_rows(chicago_weather_df) %>%
  bind_rows(seattle_weather_df) %>%
  group_by(site4, month) %>%
  summarise(mean_avg_temp = mean(avg_temp,na.rm = T)) %>%
  mutate(pct_change = (mean_avg_temp / lag(mean_avg_temp) - 1) * 100) %>%
  mutate(pct_change = case_when(
    is.na(pct_change) ~ 0,
    TRUE ~ as.numeric(pct_change)
  )) %>%
  arrange(pct_change) %>% 
  ggplot(.,aes(x = month, y = pct_change, color = site4)) +
  geom_line(aes(group = site4), size = 1)+
  geom_point() +
  theme_bw()

Where does it rain the most?

houston_weather_df %>%
  bind_rows(chicago_weather_df, seattle_weather_df) %>%
  group_by(site4,year) %>%
  summarise(sum = sum(precipitation_water_equiv, na.rm = T)) %>%
  ungroup() %>%
  ggplot(., aes(x = year, y = sum, color = site4) ) +
  geom_point() +
  geom_line(aes(group = site4), size = 1)+
  theme_bw(base_size = 14) +
  theme(axis.text.x = element_text(angle = 65, hjust = 1)) +
  ylab("Rainfall in inches") 

This graph shows total rainfall in inches over the years.

Average rainfall with the circles showing the maximum recorded rainfall for single day

houston_weather_df %>%
  bind_rows(chicago_weather_df, seattle_weather_df) %>%
  group_by(site4,month) %>%
  summarise(max = max(precipitation_water_equiv,na.rm = T), 
            mean = mean(precipitation_water_equiv, na.rm = T)) %>%
  ungroup() %>%
  ggplot(. ) +
  geom_point(aes(x = month, y = mean,size = max), color = "#2142A6") +
  coord_flip() +
  theme_bw(base_size = 14) +
  xlab("Month") +
  ylab("Average rain fall in inches") +
  facet_wrap(.~site4)

Hourly weather data

Read Houston hourly weather data

houston_weather_hourly <- read_csv("weather_data/KIAH.txt") %>%
  mutate(date=as.Date(Date,format='%m/%d/%Y')) %>%
  separate(date,c("year","month","day")) %>%
  janitor::clean_names()
## Parsed with column specification:
## cols(
##   Site = col_character(),
##   Date = col_character(),
##   Hour = col_double(),
##   Temperature = col_double(),
##   Dewpoint = col_double(),
##   RH = col_double(),
##   WindDir = col_logical(),
##   Windspeed = col_double(),
##   CldFrac = col_double(),
##   MSLP = col_logical(),
##   Weather = col_character(),
##   Precip = col_logical(),
##   Source = col_character()
## )
## Warning: 435615 parsing failures.
##  row    col           expected  actual                    file
## 4296 Precip 1/0/T/F/TRUE/FALSE 0.00    'weather_data/KIAH.txt'
## 7237 Precip 1/0/T/F/TRUE/FALSE 0.00    'weather_data/KIAH.txt'
## 8392 Precip 1/0/T/F/TRUE/FALSE 0.00    'weather_data/KIAH.txt'
## 8760 Precip 1/0/T/F/TRUE/FALSE 0.00    'weather_data/KIAH.txt'
## 8761 Precip 1/0/T/F/TRUE/FALSE 1.0e-04 'weather_data/KIAH.txt'
## .... ...... .................. ....... .......................
## See problems(...) for more details.

Read Chicago Hourly weather data

chicago_weather_hourly <- read_csv("weather_data/KORD.txt") %>%
  mutate(date = as.Date(Date, format = '%m/%d/%Y')) %>%
  separate(date, c("year","month","day")) %>%
  janitor::clean_names()
## Parsed with column specification:
## cols(
##   Site = col_character(),
##   Date = col_character(),
##   Hour = col_double(),
##   Temperature = col_double(),
##   Dewpoint = col_double(),
##   RH = col_double(),
##   WindDir = col_logical(),
##   Windspeed = col_double(),
##   CldFrac = col_double(),
##   MSLP = col_logical(),
##   Weather = col_character(),
##   Precip = col_logical(),
##   Source = col_character()
## )
## Warning: 435438 parsing failures.
##  row    col           expected  actual                    file
## 5556 Precip 1/0/T/F/TRUE/FALSE 0.00    'weather_data/KORD.txt'
## 8760 Precip 1/0/T/F/TRUE/FALSE 0.00    'weather_data/KORD.txt'
## 8761 Precip 1/0/T/F/TRUE/FALSE 0.00    'weather_data/KORD.txt'
## 8762 Precip 1/0/T/F/TRUE/FALSE 0.00    'weather_data/KORD.txt'
## 8763 Precip 1/0/T/F/TRUE/FALSE 1.0e-04 'weather_data/KORD.txt'
## .... ...... .................. ....... .......................
## See problems(...) for more details.

Add Seattle hourly weather data

seattle_weather_hourly <- read_csv("weather_data/KSEA.txt") %>%
  mutate(date=as.Date(Date, format = '%m/%d/%Y')) %>%
  separate(date, c("year", "month", "day")) %>%
  janitor::clean_names()
## Parsed with column specification:
## cols(
##   Site = col_character(),
##   Date = col_character(),
##   Hour = col_double(),
##   Temperature = col_double(),
##   Dewpoint = col_double(),
##   RH = col_double(),
##   WindDir = col_logical(),
##   Windspeed = col_double(),
##   CldFrac = col_double(),
##   MSLP = col_logical(),
##   Weather = col_character(),
##   Precip = col_double(),
##   Source = col_character()
## )
## Warning: 261428 parsing failures.
##   row     col           expected actual                    file
## 52584 WindDir 1/0/T/F/TRUE/FALSE  120   'weather_data/KSEA.txt'
## 52584 MSLP    1/0/T/F/TRUE/FALSE  29.74 'weather_data/KSEA.txt'
## 52585 WindDir 1/0/T/F/TRUE/FALSE  120   'weather_data/KSEA.txt'
## 52585 MSLP    1/0/T/F/TRUE/FALSE  29.73 'weather_data/KSEA.txt'
## 52586 WindDir 1/0/T/F/TRUE/FALSE  000   'weather_data/KSEA.txt'
## ..... ....... .................. ...... .......................
## See problems(...) for more details.

Is Seattle really cloudy as compare to Chicago and Houston?

houston_weather_hourly %>%
  bind_rows(., chicago_weather_hourly,seattle_weather_hourly ) %>%
  mutate(date=as.Date(date,format='%m/%d/%Y')) %>%
  separate(date,c("year","month","day")) %>%
  group_by(site, year) %>%
  summarise(mean_cloud_coverage = mean(`cld_frac`)) %>%
  ungroup() %>% 
  ggplot(.,aes(x = as.numeric(year), y = mean_cloud_coverage, color = site)) +
  geom_line() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) 

houston_weather_hourly %>%
  bind_rows(., chicago_weather_hourly,seattle_weather_hourly ) %>%
  mutate(date=as.Date(date,format='%m/%d/%Y')) %>%
  separate(date,c("year","month","day")) %>%
  group_by(site, month) %>%
  summarise(sum_cloud_coverage = sum(`cld_frac`),avg_cloud_coverage = mean(`cld_frac`)) %>%
  ungroup() %>% 
  ggplot(.,aes(x = as.numeric(month), y = sum_cloud_coverage, color = site)) +
  geom_line(linetype = "dotdash") +
  geom_point(aes(inherit.aes=TRUE, size = avg_cloud_coverage)) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) 
## Warning: Ignoring unknown aesthetics: inherit.aes

Days with less than 30% average cloud coverage

houston_weather_hourly %>%
  bind_rows(., chicago_weather_hourly,seattle_weather_hourly ) %>%
  mutate(date=as.Date(date,format='%m/%d/%Y')) %>%
  separate(date,c("year","month","day")) %>%
  group_by(site, year, month, day) %>%
  summarise(avg_cloud_coverage = mean(`cld_frac`)) %>%
  ungroup() %>% 
  group_by(site, year) %>%
  summarise(no_cloudy_days = sum(avg_cloud_coverage < .30)) %>%
  ggplot(.,aes(x = as.numeric(year), y = no_cloudy_days, color = site)) +
  geom_line(linetype=2) +
  geom_point(aes(inherit.aes=TRUE)) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) +
  ylab("Number of days with average cloud cover < 30%") +
  xlab("Year")
## Warning: Ignoring unknown aesthetics: inherit.aes

Is Chicago the windiest city in our set?

houston_weather_hourly %>%
  bind_rows(., chicago_weather_hourly,seattle_weather_hourly) %>%
  mutate(date=as.Date(date,format = '%m/%d/%Y')) %>%
  separate(date,c("year","month","day")) %>%
  group_by(site, year) %>%
  summarise(mean_windspeed = mean(`windspeed`)) %>%
  ungroup() %>% 
  ggplot(.,aes(x = as.numeric(year), y = mean_windspeed, color = site)) +
  geom_line(size = 2) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 10))