Bigram Relationships

Assembly
Databases
Epigenetics
Gene Expression
Genome Annotation
Phylogenetics
Sequence Alignment
Sequencing
Structural Prediction
Variant Calling

library(tidyverse)
library(tidytext)
library(tm)
library(widyr)
library(igraph)
library(ggplot2)
library(ggraph)
library(readr)
library(tidygraph)

The following function(s) are defined below:

visualize_bigrams: extracts bigrams from a text field, calculates frequency of bigrams, and creates a bigram plot to visualize relationships between words
- df_name: name of dataframe that contains the text field of interest
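- textfield: name of text field (i.e. column name)
- topic_title: title displayed on the plot; it is also used to build the file name of the saved PNG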

#' Plot and save the bigram network of a text column
#'
#' Cleans the text in `textfield`, counts adjacent word pairs (bigrams), keeps
#' the most frequent ones and draws them as a graph. Nodes are words, coloured
#' and sized by how often the word occurs in the cleaned text; edges are
#' bigrams, with opacity mapped to the bigram frequency. The plot is written
#' to `<out_dir>/bigrams_<topic_title>.png` and is also returned.
#'
#' @param df_name Data frame that contains the text column.
#' @param textfield Unquoted name of the text column.
#' @param topic_title String used as the plot title and, lower-cased with
#'   whitespace replaced by underscores, in the output file name. An empty
#'   string yields `bigrams_.png`, so repeated calls with `""` overwrite the
#'   same file.
#' @param n_bigrams Number of most frequent bigrams to plot. Ties at the
#'   cut-off are kept, so slightly more may be drawn.
#' @param out_dir Directory the PNG is written to; created if it is missing.
#' @return The plot object, returned visibly so a notebook chunk prints it.
visualize_bigrams <- function(df_name, textfield, topic_title,
                              n_bigrams = 100, out_dir = "../figures") {
  # Clean the text before tokenising:
  #  * collapse every whitespace run to a single space first, otherwise
  #    stripping non-alphanumerics would glue words together across
  #    newlines and tabs;
  #  * lower-case before removeWords(), which matches case-sensitively
  #    against the lower-case stop_words list -- without this, capitalised
  #    stop words ("The", "We") survive and reappear once unnest_tokens()
  #    lower-cases the tokens.
  df_cleaned <- df_name %>%
    mutate(
      textfield_clean = gsub("[[:space:]]+", " ", {{ textfield }}),
      textfield_clean = gsub("[^A-Za-z0-9 ]", "", textfield_clean),
      textfield_clean = removeWords(
        str_to_lower(textfield_clean),
        stop_words$word
      )
    )

  # Bigram frequencies (graph edges). Rows that produced no bigram are
  # dropped so they are not counted as a token.
  df_top_bigrams <- df_cleaned %>%
    unnest_tokens(bigrams, textfield_clean, token = "ngrams", n = 2) %>%
    filter(!is.na(bigrams)) %>%
    count(bigrams, name = "Edge_Frequency", sort = TRUE) %>%
    top_n(n_bigrams, Edge_Frequency) %>%
    separate(bigrams, c("word1", "word2"), sep = " ") %>%
    as.data.frame()

  top_bigram_words <- unique(c(df_top_bigrams$word1, df_top_bigrams$word2))

  # Single-word frequencies (graph vertices), restricted to the words that
  # appear in the plotted bigrams. The same "ngrams" tokenizer is used so
  # the vertex names match the edge endpoints exactly.
  df_word_list <- df_cleaned %>%
    unnest_tokens(words, textfield_clean, token = "ngrams", n = 1) %>%
    filter(words %in% top_bigram_words) %>%
    count(words, name = "Term_Frequency", sort = TRUE) %>%
    as.data.frame()

  bigram_graph <- graph_from_data_frame(
    d = df_top_bigrams,
    vertices = df_word_list
  )

  bigram_plot <- ggraph(bigram_graph, layout = "fr") +
    geom_edge_link(aes(edge_alpha = Edge_Frequency), show.legend = TRUE) +
    geom_node_point(
      aes(color = Term_Frequency, size = Term_Frequency),
      alpha = 0.7
    ) +
    geom_node_text(aes(label = name), repel = TRUE) +
    scale_color_viridis_c(direction = -1) +
    theme_void() +
    guides(size = "none") +
    labs(title = topic_title) +
    theme(plot.title = element_text(size = 26, face = "bold"))

  # Save alongside the other figures; replace *all* whitespace so titles with
  # more than two words still give a clean file name.
  if (!dir.exists(out_dir)) {
    dir.create(out_dir, recursive = TRUE)
  }
  file_slug <- str_to_lower(str_replace_all(topic_title, "\\s+", "_"))
  ggsave(
    filename = file.path(out_dir, paste0("bigrams_", file_slug, ".png")),
    plot = bigram_plot,
    width = 12,
    height = 8
  )

  bigram_plot
}

Read in the following data sources:

Web scraped data: this CSV contains information on journals and was created using database_parallel2.R
Topics and search terms: this CSV contains 10 topics and 3 search terms for each topic

# Load the web-scraped journal records, lower-case the column names, and keep
# the scraped search term in a lower-cased `term` column (renamed from `topic`)
df_raw <- read_csv("../data/AllWebscrape.csv")
df_raw <- rename_all(df_raw, tolower)
df_raw <- rename(df_raw, term = topic)
df_raw <- mutate(df_raw, term = str_to_lower(term))

# Load the topic / search-term lookup, lower-casing both the column names and
# the `term` values so they can be matched against the scraped data
search_terms <- read_csv("../raw-data/SearchTerms.csv")
search_terms <- rename_all(search_terms, tolower)
search_terms <- mutate(search_terms, term = str_to_lower(term))

Join journal and search term dataframes to get topics information by journals.

The topic for "global alignment sequence" and "read alignment sequence" is hard-coded: the corresponding entries in the search terms dataframe do not contain the word “sequence”, so the topic cannot be joined by matching search terms.

# Attach the topic to every journal record via its search term, then assign
# "Sequence Alignment" by hand to the two terms that are hard-coded here
df <- left_join(df_raw, search_terms, by = "term")
df <- mutate(
  df,
  topic = ifelse(
    term %in% c("global alignment sequence", "read alignment sequence"),
    "Sequence Alignment",
    topic
  )
)

Journal information is divided into 10 dataframes by topic. Visualizations can now be created by topic.

# One data frame per topic, each holding only the rows for that topic
df_assembly      <- filter(df, topic == "Assembly")
df_databases     <- filter(df, topic == "Databases")
df_epigenetics   <- filter(df, topic == "Epigenetics")
df_geneexp       <- filter(df, topic == "Gene Expression")
df_genomeann     <- filter(df, topic == "Genome Annotation")
df_phylogenetics <- filter(df, topic == "Phylogenetics")
df_seqal         <- filter(df, topic == "Sequence Alignment")
df_sequence      <- filter(df, topic == "Sequencing")
df_strucpred     <- filter(df, topic == "Structural Prediction")
df_varcall       <- filter(df, topic == "Variant Calling")

Create bigram plots to visualize the relationships between two words. The darker lines represent higher frequency of occurrence.

Assembly

# visualize_bigrams(df,abstract, "All Topics")
# Pass the topic name: an empty title saves every plot as bigrams_.png, so
# each call would overwrite the previous figure.
visualize_bigrams(df_assembly, abstract, "Assembly")

Databases

# Pass the topic name: an empty title saves every plot as bigrams_.png, so
# each call would overwrite the previous figure.
visualize_bigrams(df_databases, abstract, "Databases")

Epigenetics

# Pass the topic name: an empty title saves every plot as bigrams_.png, so
# each call would overwrite the previous figure.
visualize_bigrams(df_epigenetics, abstract, "Epigenetics")

Gene Expression

# Pass the topic name: an empty title saves every plot as bigrams_.png, so
# each call would overwrite the previous figure.
visualize_bigrams(df_geneexp, abstract, "Gene Expression")

Genome Annotation

# Pass the topic name: an empty title saves every plot as bigrams_.png, so
# each call would overwrite the previous figure.
visualize_bigrams(df_genomeann, abstract, "Genome Annotation")

Phylogenetics

# Pass the topic name: an empty title saves every plot as bigrams_.png, so
# each call would overwrite the previous figure.
visualize_bigrams(df_phylogenetics, abstract, "Phylogenetics")

Sequence Alignment

# Pass the topic name: an empty title saves every plot as bigrams_.png, so
# each call would overwrite the previous figure.
visualize_bigrams(df_seqal, abstract, "Sequence Alignment")

Sequencing

# Pass the topic name: an empty title saves every plot as bigrams_.png, so
# each call would overwrite the previous figure.
visualize_bigrams(df_sequence, abstract, "Sequencing")

Structural Prediction

# Pass the topic name: an empty title saves every plot as bigrams_.png, so
# each call would overwrite the previous figure.
visualize_bigrams(df_strucpred, abstract, "Structural Prediction")

Variant Calling

# Pass the topic name: an empty title saves every plot as bigrams_.png, so
# each call would overwrite the previous figure.
visualize_bigrams(df_varcall, abstract, "Variant Calling")