library(tidyverse)
library(data.table)
data.table 1.12.0 Latest news: r-datatable.com
Attaching package: ‘data.table’
The following objects are masked from ‘package:dplyr’:
between, first, last
The following object is masked from ‘package:purrr’:
transpose
library(janitor)
# Read the prokaryote genome table with fread and pass the result straight
# through janitor::clean_names() so the columns get cleaned, script-friendly
# names.
# NOTE(review): fread reports an improper-quoting warning for this file and
# suggests quote = "" when fields are not quoted; confirm that holds for the
# whole file before adopting it.
genomes_data <- janitor::clean_names(
  data.table::fread("genome_data/prokaryotes.txt")
)
Found and resolved improper quoting out-of-sample. First healed line 41875: <<Rhodococcus erythropolis DN1 1381122 PRJNA214035 214035 Terrabacteria group Actinobacteria 6.5484 62.4 - AUZK01 78 6163 5630 2013/08/23 2017/04/01 Contig "National Center for Biotechnology" RSE SAMN02470607 GCA_000454425.1 - ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/454/425/GCA_000454425.1_RodocDN1 24136850 DN1>>. If the fields are not quoted (e.g. field separator does not appear within any field), try quote="" to avoid this warning.
str(genomes_data)
Classes ‘data.table’ and 'data.frame': 183794 obs. of 23 variables:
$ number_organism_name : chr "Enterococcus faecium DO" "Bacillus mycoides" "Yersinia frederiksenii Y225" "Alteromonas macleodii ATCC 27126" ...
$ tax_id : int 333849 1405 1454377 529120 522373 40041 702459 465515 28152 537972 ...
$ bio_project_accession: chr "PRJNA30627" "PRJNA238211" "PRJNA236482" "PRJNA29793" ...
$ bio_project_id : int 30627 238211 236482 29793 30351 30767 42863 20655 240099 30075 ...
$ group : chr "Terrabacteria group" "Terrabacteria group" "Proteobacteria" "Proteobacteria" ...
$ sub_group : chr "Firmicutes" "Firmicutes" "Gammaproteobacteria" "Gammaproteobacteria" ...
$ size_mb : num 3.05 5.64 4.55 4.65 4.85 ...
$ gc_percent : chr "37.9158" "35.3849" "47.282" "44.7" ...
$ replicons : chr "chromosome:NC_017960.1/CP003583.1; plasmid 1:NC_017961.1/CP003584.1; plasmid 2:NC_017962.1/CP003585.1; plasmid "| __truncated__ "chromosome:NZ_CP009692.1/CP009692.1; plasmid pBMX_1:NZ_CP009691.1/CP009691.1; plasmid pBMX_2:NZ_CP009690.1/CP00"| __truncated__ "chromosome:NZ_CP009364.1/CP009364.1; plasmid unnsmrf:NZ_CP009363.1/CP009363.1" "chromosome:NC_018632.1/CP003841.1" ...
$ wgs : chr "-" "-" "-" "-" ...
$ scaffolds : chr "4" "4" "2" "1" ...
$ genes : chr "3209" "5903" "4239" "3966" ...
$ proteins : chr "3114" "5490" "3988" "3799" ...
$ release_date : chr "2012/05/25" "2015/02/05" "2015/02/09" "2012/09/21" ...
$ modify_date : chr "2016/08/03" "2017/04/03" "2017/04/03" "2017/05/18" ...
$ status : chr "Complete Genome" "Complete Genome" "Complete Genome" "Complete Genome" ...
$ center : chr "Baylor College of Medicine" "Los Alamos National Laboratory" "Los Alamos National Laboratory" "Evolucinary Genomics Group, Universidad Miguel Hernandez" ...
$ bio_sample_accession : chr "SAMN00002237" "SAMN03012838" "SAMN03010446" "SAMN02603229" ...
$ assembly_accession : chr "GCA_000174395.2" "GCA_000832605.1" "GCA_000834215.1" "GCA_000172635.2" ...
$ reference : chr "REFR" "REFR" "-" "REFR" ...
$ ftp_path : chr "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/174/395/GCA_000174395.2_ASM17439v2" "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/832/605/GCA_000832605.1_ASM83260v1" "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/834/215/GCA_000834215.1_ASM83421v1" "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/172/635/GCA_000172635.2_ASM17263v2" ...
$ pubmed_id : chr "22769602" "25931591" "25931590" "18670397,23209244" ...
$ strain : chr "DO" "ATCC 6462" "Y225" "ATCC 27126" ...
- attr(*, ".internal.selfref")=<externalptr>
# Working table for the analyses: keep only the descriptive, size/count,
# status, date and center columns, and turn the numeric-looking text columns
# into real numbers.
#
# genomes_data already had janitor::clean_names() applied when it was read,
# so the names are not cleaned a second time here.
#
# genes, proteins and gc_percent are read as character. "-" is used as a
# placeholder elsewhere in this table (e.g. wgs, reference); presumably the
# same placeholder is what made as.numeric() warn about NAs -- TODO confirm.
# It is mapped to NA explicitly so that any warning still raised here points
# at a genuinely unexpected value.
genomes_data_df <- genomes_data %>%
  select(
    number_organism_name,
    group,
    sub_group,
    strain,
    size_mb,
    gc_percent,
    scaffolds,
    genes,
    proteins,
    status,
    release_date,
    modify_date,
    center
  ) %>%
  mutate(
    genes = as.numeric(na_if(genes, "-")),
    proteins = as.numeric(na_if(proteins, "-")),
    # Converted once here; as.numeric() on an already numeric column is a
    # no-op, so code that still coerces gc_percent keeps working.
    gc_percent = as.numeric(na_if(gc_percent, "-"))
  )
NAs introduced by coercionNAs introduced by coercion
# Bar chart of the number of genome records released per year.
# release_date is split into year / month / day pieces and the rows are
# tallied by the year piece.
genomes_data_df %>%
  separate(release_date, into = c("year", "month", "day")) %>%
  count(year) %>%
  rename(genomes_cnt = n) %>%
  ggplot(aes(x = year, y = genomes_cnt)) +
  geom_col() +
  theme_minimal() +
  # Tilt the year labels so neighbouring ticks do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
NA
# Distinct values that occur in the status column.
genomes_data_df %>%
  pull(status) %>%
  unique()
[1] "Complete Genome" "Scaffold" "Chromosome" "Contig" "Complete"
Assembly status, ordered from most to least complete: Complete Genome > Chromosome > Scaffold > Contig
# Yearly release bar chart restricted to records whose status contains
# "Complete" (the pattern matches both "Complete Genome" and "Complete").
genomes_data_df %>%
  separate(release_date, into = c("year", "month", "day")) %>%
  filter(grepl("Complete", status)) %>%
  count(year) %>%
  rename(genomes_cnt = n) %>%
  ggplot(aes(x = year, y = genomes_cnt)) +
  geom_col() +
  theme_minimal() +
  # Tilt the year labels so neighbouring ticks do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Number of genome records per center, for centers whose name contains
# "Baylor" or "BCM" (presumably the abbreviation of Baylor College of
# Medicine), largest first.
#
# release_date is not used in this summary, so it is not split into parts
# here, and the name filter runs before the tally so only the matching rows
# are aggregated. The resulting table is the same: center, genomes_cnt.
genomes_data_df %>%
  filter(grepl("Baylor|BCM", center)) %>%
  count(center, sort = TRUE) %>%
  rename(genomes_cnt = n)
# Horizontal bar chart of the ten centers with the most assemblies whose
# status is "Complete Genome", "Complete" or "Chromosome".
#
# The status pattern must not contain a space before "Chromosome": the status
# value is exactly "Chromosome", so a pattern of " Chromosome" never matches
# and chromosome-level assemblies would be silently left out of the ranking.
#
# release_date is not used here, so it is not split into parts.
genomes_data_df %>%
  filter(grepl("Complete|Chromosome", status)) %>%
  group_by(center) %>%
  summarise(genomes_cnt = n()) %>%
  arrange(desc(genomes_cnt)) %>%
  # Keep the ten largest centers after sorting.
  slice(1:10) %>%
  ggplot(aes(x = reorder(center, -genomes_cnt), y = genomes_cnt)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  # Flip so the long center names run along the vertical axis.
  coord_flip()
# Number of genome records per organism name, most frequent first.
#
# release_date plays no part in this tally, so the full table is no longer
# run through separate() first; the result (number_organism_name,
# genomes_cnt) is the same.
genomes_data_df %>%
  count(number_organism_name, sort = TRUE) %>%
  rename(genomes_cnt = n)
# Organisms submitted by centers whose name mentions the CDC or the
# "Food And Drug" text, tallied per organism name and sorted from the most
# to the least frequent.
genomes_data_df %>%
  filter(
    grepl(
      "CDC|Centers for Disease Control and Prevention|Food And Drug",
      center
    )
  ) %>%
  count(number_organism_name, sort = TRUE) %>%
  rename(genomes_cnt = n)
GC content is the percentage of G and C bases among the four bases (A, T, G and C) that make up DNA.
# Mean GC percentage over "Complete Genome" records from centers whose name
# mentions the CDC or the "Food And Drug" text.
# NOTE(review): this uses an exact match on "Complete Genome", so records
# with status "Complete" are excluded, unlike the grepl("Complete", ...)
# filters used elsewhere in this file -- confirm that is intended.
genomes_data_df %>%
  filter(
    grepl(
      "CDC|Centers for Disease Control and Prevention|Food And Drug",
      center
    )
  ) %>%
  filter(status == "Complete Genome") %>%
  # Values that cannot be parsed as numbers become NA (with a warning) and
  # are skipped by na.rm below.
  mutate(gc_percent = as.numeric(gc_percent)) %>%
  # TRUE is spelled out: T is an ordinary variable that can be reassigned.
  summarize(mean_gc = mean(gc_percent, na.rm = TRUE))
NAs introduced by coercion
# Mean GC percentage per sub_group over "Complete Genome" records, highest
# first.
genomes_data_df %>%
  filter(status == "Complete Genome") %>%
  # Values that cannot be parsed as numbers become NA (with a warning) and
  # are skipped by na.rm below.
  mutate(gc_percent = as.numeric(gc_percent)) %>%
  group_by(sub_group) %>%
  # TRUE is spelled out: T is an ordinary variable that can be reassigned.
  summarize(mean_gc = mean(gc_percent, na.rm = TRUE)) %>%
  arrange(desc(mean_gc))
NAs introduced by coercion
# Average genome size (size_mb) per group for records whose status contains
# "Complete", largest first.
# size_mb is already read as a numeric column, so it is used as is rather
# than being passed through as.numeric() again.
genomes_data_df %>%
  filter(grepl("Complete", status)) %>%
  group_by(group) %>%
  # TRUE is spelled out: T is an ordinary variable that can be reassigned.
  summarize(avg_size_mb = mean(size_mb, na.rm = TRUE)) %>%
  arrange(desc(avg_size_mb))
# Average genome size (size_mb) per group / sub_group pair for records whose
# status contains "Complete", largest first.
# size_mb is already read as a numeric column, so it is used as is rather
# than being passed through as.numeric() again.
genomes_data_df %>%
  filter(grepl("Complete", status)) %>%
  group_by(group, sub_group) %>%
  # TRUE is spelled out: T is an ordinary variable that can be reassigned.
  summarize(avg_size_mb = mean(size_mb, na.rm = TRUE)) %>%
  # summarize() only peels off the last grouping level, so the result would
  # stay grouped by `group`; drop that so the table is a plain ranking.
  ungroup() %>%
  arrange(desc(avg_size_mb))
# Scatter plot of gene count against genome size for records whose status
# contains "Complete" and whose size_mb is below 5, with a straight-line
# (lm) fit drawn without its confidence band.
genomes_data_df %>%
  filter(
    grepl("Complete", status),
    size_mb < 5
  ) %>%
  ggplot(aes(x = size_mb, y = genes)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  theme_bw()
# Scatter plot of GC percentage against genome size for records whose status
# contains "Complete" and whose size_mb is above 5, with a straight-line
# (lm) fit and its confidence band.
genomes_data_df %>%
  filter(
    grepl("Complete", status),
    size_mb > 5
  ) %>%
  # Values that cannot be parsed as numbers become NA (with a warning).
  mutate(gc_percent = as.numeric(gc_percent)) %>%
  ggplot(aes(x = size_mb, y = gc_percent)) +
  geom_point() +
  geom_smooth(method = "lm", se = TRUE) +
  theme_bw()
NAs introduced by coercion