library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0 ✔ purrr 0.3.0
## ✔ tibble 2.0.1 ✔ dplyr 0.8.0
## ✔ tidyr 0.8.2 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.3.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(janitor)
library(xml2)
library(rvest)
##
## Attaching package: 'rvest'
## The following object is masked from 'package:purrr':
##
## pluck
## The following object is masked from 'package:readr':
##
## guess_encoding
Data downloaded from: https://www.fda.gov/aboutfda/transparency/opengovernment/ucm225433.htm
part_2009_2017_xml <- rbind(
XML::xmlToDataFrame("FDA_recall_data/RecallsDataSet2009-2011.xml"),
XML::xmlToDataFrame("FDA_recall_data/RecallsDataSet2012-2014.xml"),
XML::xmlToDataFrame("FDA_recall_data/RecallsDataSet2015-2017.xml")
)
part_2009_2017_df <- part_2009_2017_xml %>%
janitor::clean_names() %>%
select(-photos_link, -company_release_link, -text) %>%
mutate(date = str_sub(date, 5,16)) %>%
mutate(date = lubridate::dmy(date)) %>%
select(date, brand_name, product_description, reason, company)
Data for 2018 is available separately and is not in standard XML.
pg <- read_html("FDA_recall_data/recalls2018.xml")
date <- head(html_nodes(pg, "date"), "recall") %>%
html_text()
brand_name <- head(html_nodes(pg, "brand"), "recall") %>%
html_text()
product_description <- head(html_nodes(pg, "productdescription"), "recall") %>%
html_text()
reason <- head(html_nodes(pg, "reason"), "recall") %>%
html_text()
company <- head(html_nodes(pg, "company"), "recall") %>%
html_text()
part_2018_df <- data.frame(date,brand_name,product_description,reason,company) %>%
mutate(date = lubridate::mdy(date)) %>%
mutate_if(is.factor,as.character)
Combine data from 2018 with rest of the years starting from 2009
complete_df_1 <- rbind(part_2009_2017_df, part_2018_df)
Inspect all the columns in the dataframe
str(complete_df_1)
## 'data.frame': 4222 obs. of 5 variables:
## $ date : Date, format: "2011-12-31" "2011-12-31" ...
## $ brand_name : chr "Let's Grow Healthy Together!, Green Valley Food Corp., Broccosprouts" "Edelweiss Creamery" "Brennan's Cellars" "Martinelli's Gold Medal" ...
## $ product_description: chr "Sprouts" "cheese" "Cheese" "Sparkling Cider" ...
## $ reason : chr "Listeria monocytogenes" "Listeria monocytogenes" "Listeria monocytogenes" "Defective Seal" ...
## $ company : chr "Green Valley Food Corp." "Alpine Slicing and Cheese Conversion" "Alpine Slicing and Cheese Conversion" "S. Martinelli & Company" ...
complete_df <- complete_df_1 %>%
separate(date,c("Year","Month","Day"))
str(complete_df)
## 'data.frame': 4222 obs. of 7 variables:
## $ Year : chr "2011" "2011" "2011" "2011" ...
## $ Month : chr "12" "12" "12" "12" ...
## $ Day : chr "31" "31" "31" "30" ...
## $ brand_name : chr "Let's Grow Healthy Together!, Green Valley Food Corp., Broccosprouts" "Edelweiss Creamery" "Brennan's Cellars" "Martinelli's Gold Medal" ...
## $ product_description: chr "Sprouts" "cheese" "Cheese" "Sparkling Cider" ...
## $ reason : chr "Listeria monocytogenes" "Listeria monocytogenes" "Listeria monocytogenes" "Defective Seal" ...
## $ company : chr "Green Valley Food Corp." "Alpine Slicing and Cheese Conversion" "Alpine Slicing and Cheese Conversion" "S. Martinelli & Company" ...
complete_df %>%
group_by(product_description) %>%
summarize(c = n()) %>%
arrange(desc(c)) %>%
filter()
Since Ice cream is the number one product for the recall
complete_df %>%
filter(product_description == "Ice Cream") %>%
group_by(reason) %>%
summarize(c = n()) %>%
arrange(desc(c)) %>%
filter()
complete_df %>%
filter(product_description == "Ice Cream") %>%
group_by(brand_name) %>%
summarize(c = n()) %>%
arrange(desc(c)) %>%
filter()
complete_df %>%
group_by(reason) %>%
summarize(c = n()) %>%
arrange(desc(c)) %>%
filter()
We see that Salmonella is the number one reason over the past 10 years, but look at the reason column more carefully. There are so many entries such as “presence of salmonella” which tells us that the count is more than 1180.
We will convert all the entries into lowercase so that we can find the word “salmonella” anywhere in the reson column
complete_df %>%
mutate(reason=tolower(reason)) %>%
filter(grepl("salmonella",reason)) %>%
summarize(n=n())
The count value for Salmonella rose to 1255 to 1180.
Now we want to find what products are associated with the Salmonella contamination.
complete_df %>%
mutate(reason=tolower(reason)) %>%
filter(grepl("salmonella",reason)) %>%
group_by(reason, product_description) %>%
summarize(c = n()) %>%
arrange(desc(c))
Pistachios are the number one product with Salmonella contamination.
complete_df %>%
mutate(reason=tolower(reason)) %>%
mutate(product_description=tolower(product_description)) %>%
filter(grepl("salmonella",reason)) %>%
filter(grepl("pistachio",product_description)) %>%
group_by(brand_name, Year) %>%
summarize(c = n()) %>%
arrange(desc(c))
Is the count correct? What can be improved?
complete_df %>%
group_by(company) %>%
summarize(c = n()) %>%
arrange(desc(c))
complete_df %>%
group_by(Year) %>%
summarize(c = n()) %>%
arrange(desc(Year)) %>%
filter()
complete_df %>%
group_by(Month) %>%
summarize(c = n()) %>%
arrange(desc(c))
February has the highest recalls over the last 10 year