library(tidyverse)
library(tidytext)
library(tm)
library(widyr)
library(igraph)
library(ggplot2)
library(ggraph)
library(readr)
library(tidygraph)

The following function(s) are defined below:

#' Plot and save a network of the most frequent bigrams in a text column
#'
#' The text is cleaned (whitespace runs collapsed to one space, every character
#' other than ASCII letters, digits and spaces dropped, lower-cased, stop words
#' from `stop_words` removed) and split into bigrams. The 100 most frequent
#' bigrams (ties included) are drawn as a graph: each node is a word, sized and
#' coloured by how often the word occurs in the cleaned text; each edge is a
#' bigram, shaded by how often the pair occurs.
#'
#' @param df_name Data frame holding the text.
#' @param textfield Unquoted name of the text column in `df_name`.
#' @param topic_title String used as the plot title and, lower-cased with
#'   whitespace replaced by underscores, as part of the output file name.
#' @return The plot object. As a side effect the plot is written to
#'   `../figures/bigrams_<topic_title>.png` (12 x 8), relative to the working
#'   directory; the directory is created when missing.
visualize_bigrams <- function(df_name, textfield, topic_title) {
  # Clean the text. Whitespace is normalised first so that newlines and tabs
  # separate words instead of being deleted (which would fuse neighbours), and
  # the text is lower-cased before stop-word removal because removeWords()
  # matches case-sensitively against the lower-case stop word list.
  df_cleaned <- df_name %>%
    mutate(
      textfield_clean = gsub("[[:space:]]+", " ", {{ textfield }}),
      textfield_clean = gsub("[^A-Za-z0-9 ]", "", textfield_clean),
      textfield_clean = str_to_lower(textfield_clean),
      textfield_clean = removeWords(textfield_clean, unique(stop_words$word))
    )

  # Edges: the 100 most frequent bigrams, split into their two words.
  # Texts with fewer than two words yield NA bigrams, which are not counted.
  df_top_bigrams <- df_cleaned %>%
    unnest_tokens(bigrams, textfield_clean, token = "ngrams", n = 2) %>%
    filter(!is.na(bigrams)) %>%
    count(bigrams, name = "Edge_Frequency", sort = TRUE) %>%
    top_n(100, Edge_Frequency) %>%
    separate(bigrams, c("word1", "word2"), sep = " ")

  top_bigram_words <- unique(c(df_top_bigrams$word1, df_top_bigrams$word2))

  # Vertices: overall frequency of every word that appears in a top bigram.
  # The same tokenizer (n = 1) is used so vertex names match the edge list.
  df_word_list <- df_cleaned %>%
    unnest_tokens(words, textfield_clean, token = "ngrams", n = 1) %>%
    filter(words %in% top_bigram_words) %>%
    count(words, name = "Term_Frequency", sort = TRUE)

  graph_hold <- graph_from_data_frame(
    d = df_top_bigrams,
    vertices = df_word_list
  )

  pl <- graph_hold %>%
    ggraph(layout = "fr") +
    geom_edge_link(aes(edge_alpha = Edge_Frequency), show.legend = TRUE) +
    geom_node_point(
      aes(color = Term_Frequency, size = Term_Frequency),
      alpha = 0.7
    ) +
    geom_node_text(aes(label = name), repel = TRUE) +
    scale_color_viridis_c(direction = -1) +
    theme_void() +
    guides(size = "none") +
    labs(title = topic_title) +
    theme(plot.title = element_text(size = 26, face = "bold"))

  # Replace every whitespace character (not just the first) so multi-word
  # titles give a file name without spaces.
  file_slug <- str_to_lower(str_replace_all(topic_title, "\\s", "_"))
  out_dir <- "../figures"
  dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)
  ggsave(
    filename = file.path(out_dir, paste0("bigrams_", file_slug, ".png")),
    plot = pl,
    width = 12,
    height = 8
  )

  pl
}

Read in the following data sources:

# Load the web-scraped records: column names are lower-cased, the values of
# the `topic` column are lower-cased, and that column is renamed to `term`
df_raw <- "../data/AllWebscrape.csv" %>%
  read_csv() %>%
  rename_all(tolower) %>%
  mutate(topic = str_to_lower(topic)) %>%
  rename(term = topic)

# Load the topics / search terms table with lower-cased column names and
# lower-cased `term` values
search_terms <- "../raw-data/SearchTerms.csv" %>%
  read_csv() %>%
  rename_all(tolower) %>%
  mutate(term = str_to_lower(term))

Join journal and search term dataframes to get topics information by journals.

The topic for the search terms “global alignment sequence” and “read alignment sequence” is hard-coded: the corresponding entries in the search terms dataframe do not contain the word “sequence”, so their topic cannot be joined by matching on the search term.

# Attach the search-term columns to every scraped record by `term`; the two
# alignment-sequence terms get the topic "Sequence Alignment" assigned directly
df <- left_join(df_raw, search_terms, by = "term") %>%
  mutate(
    topic = ifelse(
      test = term %in% c("global alignment sequence", "read alignment sequence"),
      yes = "Sequence Alignment",
      no = topic
    )
  )

Journal information is divided into 10 dataframes, one per topic. Visualizations can now be created by topic.

# One data frame per topic, each keeping only the rows of `df` with that topic
df_assembly      <- filter(df, topic == "Assembly")
df_databases     <- filter(df, topic == "Databases")
df_epigenetics   <- filter(df, topic == "Epigenetics")
df_geneexp       <- filter(df, topic == "Gene Expression")
df_genomeann     <- filter(df, topic == "Genome Annotation")
df_phylogenetics <- filter(df, topic == "Phylogenetics")
df_seqal         <- filter(df, topic == "Sequence Alignment")
df_sequence      <- filter(df, topic == "Sequencing")
df_strucpred     <- filter(df, topic == "Structural Prediction")
df_varcall       <- filter(df, topic == "Variant Calling")

Create bigram plots to visualize the relationships between two words. The darker lines represent higher frequency of occurrence.

Assembly

# visualize_bigrams(df,abstract, "All Topics")
# Pass the topic name so the plot is titled and saved under its own file name
# (an empty title saves every topic to the same "../figures/bigrams_.png")
visualize_bigrams(df_assembly, abstract, "Assembly")

Databases

# Pass the topic name so the plot is titled and saved under its own file name
# (an empty title saves every topic to the same "../figures/bigrams_.png")
visualize_bigrams(df_databases, abstract, "Databases")

Epigenetics

# Pass the topic name so the plot is titled and saved under its own file name
# (an empty title saves every topic to the same "../figures/bigrams_.png")
visualize_bigrams(df_epigenetics, abstract, "Epigenetics")

Gene Expression

# Pass the topic name so the plot is titled and saved under its own file name
# (an empty title saves every topic to the same "../figures/bigrams_.png")
visualize_bigrams(df_geneexp, abstract, "Gene Expression")

Genome Annotation

# Pass the topic name so the plot is titled and saved under its own file name
# (an empty title saves every topic to the same "../figures/bigrams_.png")
visualize_bigrams(df_genomeann, abstract, "Genome Annotation")

Phylogenetics

# Pass the topic name so the plot is titled and saved under its own file name
# (an empty title saves every topic to the same "../figures/bigrams_.png")
visualize_bigrams(df_phylogenetics, abstract, "Phylogenetics")

Sequence Alignment

# Pass the topic name so the plot is titled and saved under its own file name
# (an empty title saves every topic to the same "../figures/bigrams_.png")
visualize_bigrams(df_seqal, abstract, "Sequence Alignment")

Sequencing

# Pass the topic name so the plot is titled and saved under its own file name
# (an empty title saves every topic to the same "../figures/bigrams_.png")
visualize_bigrams(df_sequence, abstract, "Sequencing")

Structural Prediction

# Pass the topic name so the plot is titled and saved under its own file name
# (an empty title saves every topic to the same "../figures/bigrams_.png")
visualize_bigrams(df_strucpred, abstract, "Structural Prediction")

Variant Calling

# Pass the topic name so the plot is titled and saved under its own file name
# (an empty title saves every topic to the same "../figures/bigrams_.png")
visualize_bigrams(df_varcall, abstract, "Variant Calling")