Load library
[30m── [1mAttaching packages[22m ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──[39m
[30m[32m✔[30m [34mggplot2[30m 3.1.0 [32m✔[30m [34mpurrr [30m 0.3.0
[32m✔[30m [34mtibble [30m 2.0.1 [32m✔[30m [34mdplyr [30m 0.8.0
[32m✔[30m [34mtidyr [30m 0.8.2 [32m✔[30m [34mstringr[30m 1.4.0
[32m✔[30m [34mreadr [30m 1.3.1 [32m✔[30m [34mforcats[30m 0.3.0[39m
[30m── [1mConflicts[22m ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[30m [34mdplyr[30m::[32mfilter()[30m masks [34mstats[30m::filter()
[31m✖[30m [34mdplyr[30m::[32mlag()[30m masks [34mstats[30m::lag()[39m
library(janitor)
library(xml2)
library(rvest)
Attaching package: ‘rvest’
The following object is masked from ‘package:purrr’:
pluck
The following object is masked from ‘package:readr’:
guess_encoding
Data downloaded from: https://www.fda.gov/aboutfda/transparency/opengovernment/ucm225433.htm
part_2009_2017_xml <- rbind(
XML::xmlToDataFrame("FDA_recall_data/RecallsDataSet2009-2011.xml"),
XML::xmlToDataFrame("FDA_recall_data/RecallsDataSet2012-2014.xml"),
XML::xmlToDataFrame("FDA_recall_data/RecallsDataSet2015-2017.xml")
)
part_2009_2017_df <- part_2009_2017_xml %>%
janitor::clean_names() %>%
select(-photos_link, -company_release_link, -text) %>%
mutate(date = str_sub(date, 5,16)) %>%
mutate(date = lubridate::dmy(date)) %>%
select(date, brand_name, product_description, reason, company)
Data for 2018 is available separately and is not in standard XML.
pg <- read_html("FDA_recall_data/recalls2018.xml")
date <- head(html_nodes(pg, "date"), "recall") %>%
html_text()
brand_name <- head(html_nodes(pg, "brand"), "recall") %>%
html_text()
product_description <- head(html_nodes(pg, "productdescription"), "recall") %>%
html_text()
reason <- head(html_nodes(pg, "reason"), "recall") %>%
html_text()
company <- head(html_nodes(pg, "company"), "recall") %>%
html_text()
part_2018_df <- data.frame(date,brand_name,product_description,reason,company) %>%
mutate(date = lubridate::mdy(date)) %>%
mutate_if(is.factor,as.character)
Combine data
Combine data from 2018 with rest of the years starting from 2009
complete_df_1 <- rbind(part_2009_2017_df, part_2018_df)
Inspect all the columns in the dataframe
'data.frame': 4222 obs. of 5 variables:
$ date : Date, format: "2011-12-31" "2011-12-31" "2011-12-31" "2011-12-30" ...
$ brand_name : chr "Let's Grow Healthy Together!, Green Valley Food Corp., Broccosprouts" "Edelweiss Creamery" "Brennan's Cellars" "Martinelli's Gold Medal" ...
$ product_description: chr "Sprouts" "cheese" "Cheese" "Sparkling Cider" ...
$ reason : chr "Listeria monocytogenes" "Listeria monocytogenes" "Listeria monocytogenes" "Defective Seal" ...
$ company : chr "Green Valley Food Corp." "Alpine Slicing and Cheese Conversion" "Alpine Slicing and Cheese Conversion" "S. Martinelli & Company" ...
Fix the date column by creating three new column as year, month and day
complete_df <- complete_df_1 %>%
separate(date,c("Year","Month","Day"))
Check the data again
'data.frame': 4222 obs. of 7 variables:
$ Year : chr "2011" "2011" "2011" "2011" ...
$ Month : chr "12" "12" "12" "12" ...
$ Day : chr "31" "31" "31" "30" ...
$ brand_name : chr "Let's Grow Healthy Together!, Green Valley Food Corp., Broccosprouts" "Edelweiss Creamery" "Brennan's Cellars" "Martinelli's Gold Medal" ...
$ product_description: chr "Sprouts" "cheese" "Cheese" "Sparkling Cider" ...
$ reason : chr "Listeria monocytogenes" "Listeria monocytogenes" "Listeria monocytogenes" "Defective Seal" ...
$ company : chr "Green Valley Food Corp." "Alpine Slicing and Cheese Conversion" "Alpine Slicing and Cheese Conversion" "S. Martinelli & Company" ...
Find the product with most recalls in last 10 year
complete_df %>%
group_by(product_description) %>%
summarize(c = n()) %>%
arrange(desc(c)) %>%
filter()
Find the top reasons for Ice cream recall.
Since Ice cream is the number one product for the recall
complete_df %>%
filter(product_description == "Ice Cream") %>%
group_by(reason) %>%
summarize(c = n()) %>%
arrange(desc(c)) %>%
filter()
Find the top brands with Ice cream recall
complete_df %>%
filter(product_description == "Ice Cream") %>%
group_by(brand_name) %>%
summarize(c = n()) %>%
arrange(desc(c)) %>%
filter()
No 1 recall reason for all items
complete_df %>%
group_by(reason) %>%
summarize(c = n()) %>%
arrange(desc(c)) %>%
filter()
We see that Salmonella is the number one reason over the past 10 years, but look at the reason column more carefully. There are so many entries such as “presence of salmonella” which tells us that the count is more than 1180.
Find all the entries where the word “Salmonella occurs”
We will convert all the entries into lowercase so that we can find the word “salmonella” anywhere in the reson column
complete_df %>%
mutate(reason=tolower(reason)) %>%
filter(grepl("salmonella",reason)) %>%
summarize(n=n())
The count value for Salmonella rose to 1255 to 1180.
Find all the entries where the word “Salmonella occurs”
Now we want to find what products are associated with the Salmonella contamination.
complete_df %>%
mutate(reason=tolower(reason)) %>%
filter(grepl("salmonella",reason)) %>%
group_by(reason, product_description) %>%
summarize(c = n()) %>%
arrange(desc(c))
Pistachios are the number one product with Salmonella contamination.
Find the brand that is associated with the Salmonella contamination and Pistachios
complete_df %>%
mutate(reason=tolower(reason)) %>%
mutate(product_description=tolower(product_description)) %>%
filter(grepl("salmonella",reason)) %>%
filter(grepl("pistachio",product_description)) %>%
group_by(brand_name, Year) %>%
summarize(c = n()) %>%
arrange(desc(c))
Is the count correct? What can be improved?
Find recall grouped by company
complete_df %>%
group_by(company) %>%
summarize(c = n()) %>%
arrange(desc(c))
Find all the recalls by year
complete_df %>%
group_by(Year) %>%
summarize(c = n()) %>%
arrange(desc(Year)) %>%
filter()
Find recalls grouped by month
complete_df %>%
group_by(Month) %>%
summarize(c = n()) %>%
arrange(desc(c))
February has the highest recalls over the last 10 year
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayA6IEFuYWx5emluZyBGREEgcmVjYWxsIGRhdGEgZnJvbSAyMDA5LTIwMTgiCm91dHB1dDogCiAgaHRtbF9ub3RlYm9vazoKICAgIGhpZ2hsaWdodDogaGFkZG9jawogICAgbWF0aGpheDogbnVsbAogICAgbnVtYmVyX3NlY3Rpb25zOiB5ZXMKICAgIHRoZW1lOiByZWFkYWJsZQogICAgdG9jOiB5ZXMKICAgIHRvY19mbG9hdDogeWVzCiAgaHRtbF9kb2N1bWVudDoKICAgIGRmX3ByaW50OiBwYWdlZAogICAgdG9jOiB5ZXMKZWRpdG9yX29wdGlvbnM6IAogIGNodW5rX291dHB1dF90eXBlOiBpbmxpbmUKLS0tCgoKIyBMb2FkIGxpYnJhcnkKYGBge3J9CmxpYnJhcnkodGlkeXZlcnNlKQpsaWJyYXJ5KGphbml0b3IpCmxpYnJhcnkoeG1sMikKbGlicmFyeShydmVzdCkKYGBgCgoKKipEYXRhIGRvd25sb2FkZWQgZnJvbTogaHR0cHM6Ly93d3cuZmRhLmdvdi9hYm91dGZkYS90cmFuc3BhcmVuY3kvb3BlbmdvdmVybm1lbnQvdWNtMjI1NDMzLmh0bSAqKgpgYGB7cn0KcGFydF8yMDA5XzIwMTdfeG1sIDwtIHJiaW5kKApYTUw6OnhtbFRvRGF0YUZyYW1lKCJGREFfcmVjYWxsX2RhdGEvUmVjYWxsc0RhdGFTZXQyMDA5LTIwMTEueG1sIiksClhNTDo6eG1sVG9EYXRhRnJhbWUoIkZEQV9yZWNhbGxfZGF0YS9SZWNhbGxzRGF0YVNldDIwMTItMjAxNC54bWwiKSwKWE1MOjp4bWxUb0RhdGFGcmFtZSgiRkRBX3JlY2FsbF9kYXRhL1JlY2FsbHNEYXRhU2V0MjAxNS0yMDE3LnhtbCIpCikgCgpwYXJ0XzIwMDlfMjAxN19kZiA8LSAgcGFydF8yMDA5XzIwMTdfeG1sICU+JQogIGphbml0b3I6OmNsZWFuX25hbWVzKCkgJT4lCiAgc2VsZWN0KC1waG90b3NfbGluaywgLWNvbXBhbnlfcmVsZWFzZV9saW5rLCAtdGV4dCkgJT4lCiAgbXV0YXRlKGRhdGUgPSBzdHJfc3ViKGRhdGUsIDUsMTYpKSAlPiUKICBtdXRhdGUoZGF0ZSA9IGx1YnJpZGF0ZTo6ZG15KGRhdGUpKSAlPiUKICBzZWxlY3QoZGF0ZSwgYnJhbmRfbmFtZSwgcHJvZHVjdF9kZXNjcmlwdGlvbiwgcmVhc29uLCBjb21wYW55KQpgYGAgICAgICAgICAgICAKCgpEYXRhIGZvciAyMDE4IGlzIGF2YWlsYWJsZSBzZXBhcmF0ZWx5IGFuZCBpcyBub3QgaW4gc3RhbmRhcmQgWE1MLiAKYGBge3J9ICAgICAgICAgICAgCnBnIDwtIHJlYWRfaHRtbCgiRkRBX3JlY2FsbF9kYXRhL3JlY2FsbHMyMDE4LnhtbCIpCmRhdGUgPC0gaGVhZChodG1sX25vZGVzKHBnLCAiZGF0ZSIpLCAicmVjYWxsIikgJT4lCiAgICAgICAgaHRtbF90ZXh0KCkKYnJhbmRfbmFtZSA8LSBoZWFkKGh0bWxfbm9kZXMocGcsICJicmFuZCIpLCAicmVjYWxsIikgJT4lCiAgICAgICAgaHRtbF90ZXh0KCkKcHJvZHVjdF9kZXNjcmlwdGlvbiA8LSBoZWFkKGh0bWxfbm9kZXMocGcsICJwcm9kdWN0ZGVzY3JpcHRpb24iKSwgInJlY2FsbCIpICU+JQogICAgICAgIGh0bWxfdGV4dCgpCnJlYXNvbiA8LSBoZWFkKGh0bWxfbm9kZXMocGcsICJyZWFzb24iKSwgInJlY2FsbCIpICU+JQogICAgICAgIGh0bWxfdGV4dCgpCmNvbXBhbnkgPC0gaGVhZChodG1sX25vZGVzKHBnLCAiY29tcGFueSIpLCAicmVjYWxsIikgJT4lCiAgICAgICAgaHRtbF90ZXh0KCkKCnBhcnRfMjAxOF9kZiA8LSBkYXRhLmZyYW1lKGRhdGUsYnJhbmRfbmFtZSxwcm9kdWN0X2Rlc2NyaXB0aW9uLHJlYXNvbixjb21wYW55KSAlPiUKICAgICAgICAgICAgIG11dGF0ZShkYXRlID0gbHVicmlkYXRlOjptZHkoZGF0ZSkpICU+JQogICAgICAgICAgICAgbXV0YXRlX2lmKGlzLmZhY3Rvcixhcy5jaGFyYWN0ZXIpCmBgYAoKCiMgQ29tYmluZSBkYXRhIApDb21iaW5lIGRhdGEgZnJvbSAyMDE4IHdpdGggcmVzdCBvZiB0aGUgeWVhcnMgc3RhcnRpbmcgZnJvbSAyMDA5CmBgYHtyfQpjb21wbGV0ZV9kZl8xIDwtIHJiaW5kKHBhcnRfMjAwOV8yMDE3X2RmLCBwYXJ0XzIwMThfZGYpCmBgYAoKSW5zcGVjdCBhbGwgdGhlIGNvbHVtbnMgaW4gdGhlIGRhdGFmcmFtZQpgYGB7cn0Kc3RyKGNvbXBsZXRlX2RmXzEpCmBgYAoKI0ZpeCB0aGUgZGF0ZSBjb2x1bW4gYnkgY3JlYXRpbmcgdGhyZWUgbmV3IGNvbHVtbiBhcyB5ZWFyLCBtb250aCBhbmQgZGF5CmBgYHtyfQpjb21wbGV0ZV9kZiA8LSBjb21wbGV0ZV9kZl8xICU+JQogIHNlcGFyYXRlKGRhdGUsYygiWWVhciIsIk1vbnRoIiwiRGF5IikpCmBgYAoKCiMgQ2hlY2sgdGhlIGRhdGEgYWdhaW4KYGBge3J9CnN0cihjb21wbGV0ZV9kZikKYGBgCgoKCgoKIyBGaW5kIHRoZSBwcm9kdWN0IHdpdGggbW9zdCByZWNhbGxzIGluIGxhc3QgMTAgeWVhcgpgYGB7cn0KY29tcGxldGVfZGYgJT4lCiAgZ3JvdXBfYnkocHJvZHVjdF9kZXNjcmlwdGlvbikgJT4lCiAgc3VtbWFyaXplKGMgPSBuKCkpICU+JQogIGFycmFuZ2UoZGVzYyhjKSkgJT4lCiAgZmlsdGVyKCkKYGBgCgoKCgojIEZpbmQgdGhlIHRvcCByZWFzb25zIGZvciBJY2UgY3JlYW0gcmVjYWxsLiAKU2luY2UgSWNlIGNyZWFtIGlzIHRoZSBudW1iZXIgb25lIHByb2R1Y3QgZm9yIHRoZSByZWNhbGwKYGBge3J9CmNvbXBsZXRlX2RmICU+JQogIGZpbHRlcihwcm9kdWN0X2Rlc2NyaXB0aW9uID09ICJJY2UgQ3JlYW0iKSAlPiUKICBncm91cF9ieShyZWFzb24pICU+JQogIHN1bW1hcml6ZShjID0gbigpKSAlPiUKICBhcnJhbmdlKGRlc2MoYykpICU+JQogIGZpbHRlcigpCmBgYAoKIyBGaW5kIHRoZSB0b3AgYnJhbmRzIHdpdGggSWNlIGNyZWFtIHJlY2FsbApgYGB7cn0KY29tcGxldGVfZGYgJT4lCiAgZmlsdGVyKHByb2R1Y3RfZGVzY3JpcHRpb24gPT0gIkljZSBDcmVhbSIpICU+JQogIGdyb3VwX2J5KGJyYW5kX25hbWUpICU+JQogIHN1bW1hcml6ZShjID0gbigpKSAlPiUKICBhcnJhbmdlKGRlc2MoYykpICU+JQogIGZpbHRlcigpCmBgYAoKCgoKIyBObyAxIHJlY2FsbCByZWFzb24gZm9yIGFsbCBpdGVtcyAKYGBge3J9CmNvbXBsZXRlX2RmICU+JQogIGdyb3VwX2J5KHJlYXNvbikgJT4lCiAgc3VtbWFyaXplKGMgPSBuKCkpICU+JQogIGFycmFuZ2UoZGVzYyhjKSkgJT4lCiAgZmlsdGVyKCkKYGBgCldlIHNlZSB0aGF0IFNhbG1vbmVsbGEgaXMgdGhlIG51bWJlciBvbmUgcmVhc29uIG92ZXIgdGhlIHBhc3QgMTAgeWVhcnMsIGJ1dCBsb29rIGF0IHRoZSByZWFzb24gY29sdW1uIG1vcmUgY2FyZWZ1bGx5LiBUaGVyZSBhcmUgc28gbWFueSBlbnRyaWVzIHN1Y2ggYXMgInByZXNlbmNlIG9mIHNhbG1vbmVsbGEiIHdoaWNoIHRlbGxzIHVzIHRoYXQgdGhlIGNvdW50IGlzIG1vcmUgdGhhbiAxMTgwLiAKCgoKCiMgRmluZCBhbGwgdGhlIGVudHJpZXMgd2hlcmUgdGhlIHdvcmQgIlNhbG1vbmVsbGEgb2NjdXJzIgpXZSB3aWxsIGNvbnZlcnQgYWxsIHRoZSBlbnRyaWVzIGludG8gbG93ZXJjYXNlIHNvIHRoYXQgd2UgY2FuIGZpbmQgdGhlIHdvcmQgInNhbG1vbmVsbGEiIGFueXdoZXJlIGluIHRoZSByZXNvbiBjb2x1bW4KYGBge3J9CmNvbXBsZXRlX2RmICU+JQogICBtdXRhdGUocmVhc29uPXRvbG93ZXIocmVhc29uKSkgJT4lCiAgIGZpbHRlcihncmVwbCgic2FsbW9uZWxsYSIscmVhc29uKSkgJT4lCiAgIHN1bW1hcml6ZShuPW4oKSkKYGBgClRoZSBjb3VudCB2YWx1ZSBmb3IgU2FsbW9uZWxsYSByb3NlIHRvIDEyNTUgdG8gMTE4MC4gCgoKCiMgRmluZCBhbGwgdGhlIGVudHJpZXMgd2hlcmUgdGhlIHdvcmQgIlNhbG1vbmVsbGEgb2NjdXJzIgpOb3cgd2Ugd2FudCB0byBmaW5kIHdoYXQgcHJvZHVjdHMgYXJlIGFzc29jaWF0ZWQgd2l0aCB0aGUgU2FsbW9uZWxsYSBjb250YW1pbmF0aW9uLiAKYGBge3J9CmNvbXBsZXRlX2RmICU+JQogICBtdXRhdGUocmVhc29uPXRvbG93ZXIocmVhc29uKSkgJT4lCiAgIGZpbHRlcihncmVwbCgic2FsbW9uZWxsYSIscmVhc29uKSkgJT4lCiAgIGdyb3VwX2J5KHJlYXNvbiwgcHJvZHVjdF9kZXNjcmlwdGlvbikgJT4lCiAgIHN1bW1hcml6ZShjID0gbigpKSAlPiUKICAgYXJyYW5nZShkZXNjKGMpKQpgYGAKUGlzdGFjaGlvcyBhcmUgdGhlIG51bWJlciBvbmUgcHJvZHVjdCB3aXRoIFNhbG1vbmVsbGEgY29udGFtaW5hdGlvbi4gCgoKCiMgRmluZCB0aGUgYnJhbmQgdGhhdCBpcyBhc3NvY2lhdGVkIHdpdGggdGhlIFNhbG1vbmVsbGEgY29udGFtaW5hdGlvbiBhbmQgUGlzdGFjaGlvcwpgYGB7cn0KY29tcGxldGVfZGYgJT4lCiAgIG11dGF0ZShyZWFzb249dG9sb3dlcihyZWFzb24pKSAlPiUKICAgbXV0YXRlKHByb2R1Y3RfZGVzY3JpcHRpb249dG9sb3dlcihwcm9kdWN0X2Rlc2NyaXB0aW9uKSkgJT4lCiAgIGZpbHRlcihncmVwbCgic2FsbW9uZWxsYSIscmVhc29uKSkgJT4lCiAgIGZpbHRlcihncmVwbCgicGlzdGFjaGlvIixwcm9kdWN0X2Rlc2NyaXB0aW9uKSkgJT4lCiAgIGdyb3VwX2J5KGJyYW5kX25hbWUsIFllYXIpICU+JQogICBzdW1tYXJpemUoYyA9IG4oKSkgJT4lCiAgIGFycmFuZ2UoZGVzYyhjKSkKYGBgCklzIHRoZSBjb3VudCBjb3JyZWN0PyBXaGF0IGNhbiBiZSBpbXByb3ZlZD8KCgojRmluZCByZWNhbGwgZ3JvdXBlZCBieSBjb21wYW55CmBgYHtyfQpjb21wbGV0ZV9kZiAlPiUKICBncm91cF9ieShjb21wYW55KSAlPiUKICBzdW1tYXJpemUoYyA9IG4oKSkgJT4lCiAgYXJyYW5nZShkZXNjKGMpKSAKYGBgCgoKCiMgRmluZCBhbGwgdGhlIHJlY2FsbHMgYnkgeWVhcgpgYGB7cn0KY29tcGxldGVfZGYgJT4lCiAgZ3JvdXBfYnkoWWVhcikgJT4lCiAgc3VtbWFyaXplKGMgPSBuKCkpICU+JQogIGFycmFuZ2UoZGVzYyhZZWFyKSkgJT4lCiAgZmlsdGVyKCkKYGBgCgoKCgojRmluZCByZWNhbGxzIGdyb3VwZWQgYnkgbW9udGgKYGBge3J9CmNvbXBsZXRlX2RmICU+JQogIGdyb3VwX2J5KE1vbnRoKSAlPiUKICBzdW1tYXJpemUoYyA9IG4oKSkgJT4lCiAgYXJyYW5nZShkZXNjKGMpKSAKYGBgCkZlYnJ1YXJ5IGhhcyB0aGUgaGlnaGVzdCByZWNhbGxzIG92ZXIgdGhlIGxhc3QgMTAgeWVhcgoKCgoKCgo=