Load library

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0     ✔ purrr   0.3.0
## ✔ tibble  2.0.1     ✔ dplyr   0.8.0
## ✔ tidyr   0.8.2     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.3.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
library(janitor)

Read data

genomes_data <- data.table::fread("genome_data/prokaryotes.txt") %>%
  janitor::clean_names()
## Warning in data.table::fread("genome_data/prokaryotes.txt"): Found
## and resolved improper quoting out-of-sample. First healed line 41875:
## <<Rhodococcus erythropolis DN1 1381122 PRJNA214035 214035 Terrabacteria
## group Actinobacteria 6.5484 62.4 - AUZK01 78 6163 5630 2013/08/23
## 2017/04/01 Contig "National Center for Biotechnology" RSE SAMN02470607
## GCA_000454425.1 - ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/454/425/
## GCA_000454425.1_RodocDN1 24136850 DN1>>. If the fields are not quoted (e.g.
## field separator does not appear within any field), try quote="" to avoid
## this warning.
str(genomes_data)
## Classes 'data.table' and 'data.frame':   183794 obs. of  23 variables:
##  $ number_organism_name : chr  "Enterococcus faecium DO" "Bacillus mycoides" "Yersinia frederiksenii Y225" "Alteromonas macleodii ATCC 27126" ...
##  $ tax_id               : int  333849 1405 1454377 529120 522373 40041 702459 465515 28152 537972 ...
##  $ bio_project_accession: chr  "PRJNA30627" "PRJNA238211" "PRJNA236482" "PRJNA29793" ...
##  $ bio_project_id       : int  30627 238211 236482 29793 30351 30767 42863 20655 240099 30075 ...
##  $ group                : chr  "Terrabacteria group" "Terrabacteria group" "Proteobacteria" "Proteobacteria" ...
##  $ sub_group            : chr  "Firmicutes" "Firmicutes" "Gammaproteobacteria" "Gammaproteobacteria" ...
##  $ size_mb              : num  3.05 5.64 4.55 4.65 4.85 ...
##  $ gc_percent           : chr  "37.9158" "35.3849" "47.282" "44.7" ...
##  $ replicons            : chr  "chromosome:NC_017960.1/CP003583.1; plasmid 1:NC_017961.1/CP003584.1; plasmid 2:NC_017962.1/CP003585.1; plasmid "| __truncated__ "chromosome:NZ_CP009692.1/CP009692.1; plasmid pBMX_1:NZ_CP009691.1/CP009691.1; plasmid pBMX_2:NZ_CP009690.1/CP00"| __truncated__ "chromosome:NZ_CP009364.1/CP009364.1; plasmid unnsmrf:NZ_CP009363.1/CP009363.1" "chromosome:NC_018632.1/CP003841.1" ...
##  $ wgs                  : chr  "-" "-" "-" "-" ...
##  $ scaffolds            : chr  "4" "4" "2" "1" ...
##  $ genes                : chr  "3209" "5903" "4239" "3966" ...
##  $ proteins             : chr  "3114" "5490" "3988" "3799" ...
##  $ release_date         : chr  "2012/05/25" "2015/02/05" "2015/02/09" "2012/09/21" ...
##  $ modify_date          : chr  "2016/08/03" "2017/04/03" "2017/04/03" "2017/05/18" ...
##  $ status               : chr  "Complete Genome" "Complete Genome" "Complete Genome" "Complete Genome" ...
##  $ center               : chr  "Baylor College of Medicine" "Los Alamos National Laboratory" "Los Alamos National Laboratory" "Evolucinary Genomics Group, Universidad Miguel Hernandez" ...
##  $ bio_sample_accession : chr  "SAMN00002237" "SAMN03012838" "SAMN03010446" "SAMN02603229" ...
##  $ assembly_accession   : chr  "GCA_000174395.2" "GCA_000832605.1" "GCA_000834215.1" "GCA_000172635.2" ...
##  $ reference            : chr  "REFR" "REFR" "-" "REFR" ...
##  $ ftp_path             : chr  "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/174/395/GCA_000174395.2_ASM17439v2" "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/832/605/GCA_000832605.1_ASM83260v1" "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/834/215/GCA_000834215.1_ASM83421v1" "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/172/635/GCA_000172635.2_ASM17263v2" ...
##  $ pubmed_id            : chr  "22769602" "25931591" "25931590" "18670397,23209244" ...
##  $ strain               : chr  "DO" "ATCC 6462" "Y225" "ATCC 27126" ...
##  - attr(*, ".internal.selfref")=<externalptr>

Fix the names and select columns we need

genomes_data_df <- genomes_data %>%
  janitor::clean_names() %>%
  select(number_organism_name,
         group,
         sub_group,
         strain,
         size_mb,
         gc_percent,
         scaffolds,
         genes,
         proteins,
         status,
         release_date,
         modify_date,
         center
         ) %>%
  mutate(genes=as.numeric(genes), proteins = as.numeric(proteins))
## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

Find number of genomes released by year

genomes_data_df %>%
    separate(release_date, c("year","month","day")) %>%
    group_by(year) %>%
    summarise(genomes_cnt = n()) %>%
    ggplot(.,aes(x = year, y = genomes_cnt)) +
    geom_bar(stat = "identity") +
    theme_minimal()+
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) 

Find number of complete genomes by year

unique(genomes_data_df$status)
## [1] "Complete Genome" "Scaffold"        "Chromosome"      "Contig"         
## [5] "Complete"

Complete Genome \(>\) Chromosome \(>\) Scaffold \(>\) Contig

genomes_data_df %>%
    separate(release_date, c("year","month","day")) %>%
    filter(grepl("Complete", status)) %>%
    group_by(year) %>%
    summarise(genomes_cnt = n()) %>%
    ggplot(.,aes(x = year, y = genomes_cnt)) +
    geom_bar(stat = "identity") +
    theme_minimal()+
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) 

Find number of genomes released by Baylor College of Medicine

genomes_data_df %>%
    separate(release_date, c("year","month","day")) %>%
    group_by(center) %>%
    summarise(genomes_cnt = n()) %>%
    arrange(desc(genomes_cnt)) %>%
    filter(grepl("Baylor|BCM",center))

Find top 10 submitter of complete genomes.

genomes_data_df %>%
    separate(release_date, c("year","month","day")) %>%
    filter(grepl("Complete| Chromosome", status)) %>%
    group_by(center) %>%
    summarise(genomes_cnt = n()) %>%
    arrange(desc(genomes_cnt)) %>%
    slice(1:10) %>%
    ggplot(.,aes(x = reorder(center, -genomes_cnt), y = genomes_cnt)) +
    geom_bar(stat = "identity") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
    coord_flip()

Find the most submitted genome bacterium

genomes_data_df %>%
    separate(release_date, c("year","month","day")) %>%
    group_by(number_organism_name) %>%
    summarise(genomes_cnt = n()) %>%
    arrange(desc(genomes_cnt))

Find the most submitted genome by CDC

genomes_data_df %>%
  filter(grepl("CDC|Centers for Disease Control and Prevention|Food And Drug",center)) %>%
   group_by(number_organism_name) %>%
    summarise(genomes_cnt = n()) %>%
    arrange(desc(genomes_cnt))

What is the GC content of the bacteria sequenced by the CDC as compared to overall GC content?

GC content indicates the percent of G,C bases out of total four base - ATGC that makes DNA

genomes_data_df %>%
  filter(grepl("CDC|Centers for Disease Control and Prevention|Food And Drug",center)) %>%
  filter(status == "Complete Genome") %>%
  mutate(gc_percent = as.numeric(gc_percent)) %>%
  summarize(mean_gc = mean(gc_percent, na.rm = T))
## Warning: NAs introduced by coercion

Calculate the GC percent by the group

genomes_data_df %>%
  filter(status == "Complete Genome") %>%
  mutate(gc_percent = as.numeric(gc_percent)) %>%
  group_by(sub_group) %>%
  summarize(mean_gc = mean(gc_percent, na.rm = T)) %>%
  arrange(desc(mean_gc))
## Warning: NAs introduced by coercion

Calculate the size by the group

genomes_data_df %>%
  filter(grepl("Complete", status)) %>%
  mutate(size_mb = as.numeric(size_mb)) %>%
  group_by(group) %>%
  summarize(avg_size_mb = mean(size_mb, na.rm = T)) %>%
  arrange(desc(avg_size_mb))

Calculate the size by the group and sub_group

genomes_data_df %>%
  filter(grepl("Complete", status)) %>%
  mutate(size_mb = as.numeric(size_mb)) %>%
  group_by(group,sub_group) %>%
  summarize(avg_size_mb = mean(size_mb, na.rm = T)) %>%
  arrange(desc(avg_size_mb))

What is the relation between genome size and genes?

genomes_data_df %>% 
filter(grepl("Complete", status)) %>%
filter(size_mb < 5) %>%
ggplot(.,aes(x = size_mb, y = genes)) +
geom_point() +
theme_bw() +
geom_smooth(method = "lm", se = FALSE)
## Warning: Removed 123 rows containing non-finite values (stat_smooth).
## Warning: Removed 123 rows containing missing values (geom_point).

What is the relation between genome size and GC-content?

genomes_data_df %>% 
filter(grepl("Complete", status)) %>%
filter(size_mb > 5) %>%
mutate(gc_percent = as.numeric(gc_percent)) %>%
ggplot(.,aes(x = size_mb, y = gc_percent)) +
geom_point() +
theme_bw() +
geom_smooth(method = "lm", se = TRUE)
## Warning: NAs introduced by coercion
## Warning: Removed 41 rows containing non-finite values (stat_smooth).
## Warning: Removed 41 rows containing missing values (geom_point).