library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0 ✔ purrr 0.3.0
## ✔ tibble 2.0.1 ✔ dplyr 0.8.0
## ✔ tidyr 0.8.2 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.3.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
library(janitor)
genomes_data <- data.table::fread("genome_data/prokaryotes.txt") %>%
janitor::clean_names()
## Warning in data.table::fread("genome_data/prokaryotes.txt"): Found
## and resolved improper quoting out-of-sample. First healed line 41875:
## <<Rhodococcus erythropolis DN1 1381122 PRJNA214035 214035 Terrabacteria
## group Actinobacteria 6.5484 62.4 - AUZK01 78 6163 5630 2013/08/23
## 2017/04/01 Contig "National Center for Biotechnology" RSE SAMN02470607
## GCA_000454425.1 - ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/454/425/
## GCA_000454425.1_RodocDN1 24136850 DN1>>. If the fields are not quoted (e.g.
## field separator does not appear within any field), try quote="" to avoid
## this warning.
str(genomes_data)
## Classes 'data.table' and 'data.frame': 183794 obs. of 23 variables:
## $ number_organism_name : chr "Enterococcus faecium DO" "Bacillus mycoides" "Yersinia frederiksenii Y225" "Alteromonas macleodii ATCC 27126" ...
## $ tax_id : int 333849 1405 1454377 529120 522373 40041 702459 465515 28152 537972 ...
## $ bio_project_accession: chr "PRJNA30627" "PRJNA238211" "PRJNA236482" "PRJNA29793" ...
## $ bio_project_id : int 30627 238211 236482 29793 30351 30767 42863 20655 240099 30075 ...
## $ group : chr "Terrabacteria group" "Terrabacteria group" "Proteobacteria" "Proteobacteria" ...
## $ sub_group : chr "Firmicutes" "Firmicutes" "Gammaproteobacteria" "Gammaproteobacteria" ...
## $ size_mb : num 3.05 5.64 4.55 4.65 4.85 ...
## $ gc_percent : chr "37.9158" "35.3849" "47.282" "44.7" ...
## $ replicons : chr "chromosome:NC_017960.1/CP003583.1; plasmid 1:NC_017961.1/CP003584.1; plasmid 2:NC_017962.1/CP003585.1; plasmid "| __truncated__ "chromosome:NZ_CP009692.1/CP009692.1; plasmid pBMX_1:NZ_CP009691.1/CP009691.1; plasmid pBMX_2:NZ_CP009690.1/CP00"| __truncated__ "chromosome:NZ_CP009364.1/CP009364.1; plasmid unnsmrf:NZ_CP009363.1/CP009363.1" "chromosome:NC_018632.1/CP003841.1" ...
## $ wgs : chr "-" "-" "-" "-" ...
## $ scaffolds : chr "4" "4" "2" "1" ...
## $ genes : chr "3209" "5903" "4239" "3966" ...
## $ proteins : chr "3114" "5490" "3988" "3799" ...
## $ release_date : chr "2012/05/25" "2015/02/05" "2015/02/09" "2012/09/21" ...
## $ modify_date : chr "2016/08/03" "2017/04/03" "2017/04/03" "2017/05/18" ...
## $ status : chr "Complete Genome" "Complete Genome" "Complete Genome" "Complete Genome" ...
## $ center : chr "Baylor College of Medicine" "Los Alamos National Laboratory" "Los Alamos National Laboratory" "Evolucinary Genomics Group, Universidad Miguel Hernandez" ...
## $ bio_sample_accession : chr "SAMN00002237" "SAMN03012838" "SAMN03010446" "SAMN02603229" ...
## $ assembly_accession : chr "GCA_000174395.2" "GCA_000832605.1" "GCA_000834215.1" "GCA_000172635.2" ...
## $ reference : chr "REFR" "REFR" "-" "REFR" ...
## $ ftp_path : chr "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/174/395/GCA_000174395.2_ASM17439v2" "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/832/605/GCA_000832605.1_ASM83260v1" "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/834/215/GCA_000834215.1_ASM83421v1" "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/172/635/GCA_000172635.2_ASM17263v2" ...
## $ pubmed_id : chr "22769602" "25931591" "25931590" "18670397,23209244" ...
## $ strain : chr "DO" "ATCC 6462" "Y225" "ATCC 27126" ...
## - attr(*, ".internal.selfref")=<externalptr>
genomes_data_df <- genomes_data %>%
janitor::clean_names() %>%
select(number_organism_name,
group,
sub_group,
strain,
size_mb,
gc_percent,
scaffolds,
genes,
proteins,
status,
release_date,
modify_date,
center
) %>%
mutate(genes=as.numeric(genes), proteins = as.numeric(proteins))
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
genomes_data_df %>%
separate(release_date, c("year","month","day")) %>%
group_by(year) %>%
summarise(genomes_cnt = n()) %>%
ggplot(.,aes(x = year, y = genomes_cnt)) +
geom_bar(stat = "identity") +
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, hjust = 1))
unique(genomes_data_df$status)
## [1] "Complete Genome" "Scaffold" "Chromosome" "Contig"
## [5] "Complete"
Complete Genome \(>\) Chromosome \(>\) Scaffold \(>\) Contig
genomes_data_df %>%
separate(release_date, c("year","month","day")) %>%
filter(grepl("Complete", status)) %>%
group_by(year) %>%
summarise(genomes_cnt = n()) %>%
ggplot(.,aes(x = year, y = genomes_cnt)) +
geom_bar(stat = "identity") +
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, hjust = 1))
genomes_data_df %>%
separate(release_date, c("year","month","day")) %>%
group_by(center) %>%
summarise(genomes_cnt = n()) %>%
arrange(desc(genomes_cnt)) %>%
filter(grepl("Baylor|BCM",center))
genomes_data_df %>%
separate(release_date, c("year","month","day")) %>%
filter(grepl("Complete| Chromosome", status)) %>%
group_by(center) %>%
summarise(genomes_cnt = n()) %>%
arrange(desc(genomes_cnt)) %>%
slice(1:10) %>%
ggplot(.,aes(x = reorder(center, -genomes_cnt), y = genomes_cnt)) +
geom_bar(stat = "identity") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
coord_flip()
genomes_data_df %>%
separate(release_date, c("year","month","day")) %>%
group_by(number_organism_name) %>%
summarise(genomes_cnt = n()) %>%
arrange(desc(genomes_cnt))
genomes_data_df %>%
filter(grepl("CDC|Centers for Disease Control and Prevention|Food And Drug",center)) %>%
group_by(number_organism_name) %>%
summarise(genomes_cnt = n()) %>%
arrange(desc(genomes_cnt))
GC content indicates the percent of G,C bases out of total four base - ATGC that makes DNA
genomes_data_df %>%
filter(grepl("CDC|Centers for Disease Control and Prevention|Food And Drug",center)) %>%
filter(status == "Complete Genome") %>%
mutate(gc_percent = as.numeric(gc_percent)) %>%
summarize(mean_gc = mean(gc_percent, na.rm = T))
## Warning: NAs introduced by coercion
genomes_data_df %>%
filter(status == "Complete Genome") %>%
mutate(gc_percent = as.numeric(gc_percent)) %>%
group_by(sub_group) %>%
summarize(mean_gc = mean(gc_percent, na.rm = T)) %>%
arrange(desc(mean_gc))
## Warning: NAs introduced by coercion
genomes_data_df %>%
filter(grepl("Complete", status)) %>%
mutate(size_mb = as.numeric(size_mb)) %>%
group_by(group) %>%
summarize(avg_size_mb = mean(size_mb, na.rm = T)) %>%
arrange(desc(avg_size_mb))
genomes_data_df %>%
filter(grepl("Complete", status)) %>%
mutate(size_mb = as.numeric(size_mb)) %>%
group_by(group,sub_group) %>%
summarize(avg_size_mb = mean(size_mb, na.rm = T)) %>%
arrange(desc(avg_size_mb))
genomes_data_df %>%
filter(grepl("Complete", status)) %>%
filter(size_mb < 5) %>%
ggplot(.,aes(x = size_mb, y = genes)) +
geom_point() +
theme_bw() +
geom_smooth(method = "lm", se = FALSE)
## Warning: Removed 123 rows containing non-finite values (stat_smooth).
## Warning: Removed 123 rows containing missing values (geom_point).
genomes_data_df %>%
filter(grepl("Complete", status)) %>%
filter(size_mb > 5) %>%
mutate(gc_percent = as.numeric(gc_percent)) %>%
ggplot(.,aes(x = size_mb, y = gc_percent)) +
geom_point() +
theme_bw() +
geom_smooth(method = "lm", se = TRUE)
## Warning: NAs introduced by coercion
## Warning: Removed 41 rows containing non-finite values (stat_smooth).
## Warning: Removed 41 rows containing missing values (geom_point).