library(tidyverse)
library(tidytext)
library(tm)
library(widyr)
library(igraph)
library(ggplot2)
library(ggraph)
library(readr)
library(tidygraph)
The following function(s) are defined below:
#' Plot and save a network graph of the most frequent bigrams in a text column
#'
#' The text is stripped of every character that is not a letter, digit or
#' space, lower-cased, and cleared of stop words (`stop_words$word`). The 100
#' most frequent bigrams (ties kept) become the edges of the graph; every word
#' that appears in one of them becomes a node whose colour and size encode how
#' often the word occurs in the cleaned text.
#'
#' The plot is written to `../figures/bigrams_<title>.png`, where `<title>` is
#' `topic_title` lower-cased with whitespace replaced by underscores. Width
#' and height are 12 x 8 in `ggsave()`'s default units.
#'
#' @param df_name Data frame holding the text to analyse.
#' @param textfield Unquoted name of the text column in `df_name`.
#' @param topic_title String used as the plot title and in the file name.
#' @return The ggraph plot object (it is also saved to disk as a side effect).
visualize_bigrams <- function(df_name, textfield, topic_title) {
  # Clean the text. Lower-casing happens before removeWords() because its
  # matching is case-sensitive; without it capitalised stop words such as
  # "The" survive and are only lower-cased later by unnest_tokens().
  df_cleaned <- df_name %>%
    mutate(
      textfield_clean = removeWords(
        str_to_lower(gsub("[^A-Za-z0-9 ]", "", {{ textfield }})),
        stop_words$word
      )
    )

  # Create frequencies of bigrams, most frequent first
  df_bigrams <- df_cleaned %>%
    unnest_tokens(bigrams, textfield_clean, token = "ngrams", n = 2)
  df_freq <- as.data.frame(table(df_bigrams$bigrams)) %>%
    arrange(desc(Freq))

  # Edges: the top 100 bigrams, split into their two words
  df_top_bigrams <- df_freq %>%
    top_n(100, Freq) %>%
    separate(Var1, c("word1", "word2"), sep = " ")
  top_bigram_words <- unique(c(df_top_bigrams$word1, df_top_bigrams$word2))

  # Nodes: single-word frequencies, restricted to words used by the edges
  word_list <- df_cleaned %>%
    unnest_tokens(words, textfield_clean, token = "ngrams", n = 1)
  df_word_list <- as.data.frame(table(word_list$words)) %>%
    arrange(desc(Freq)) %>%
    filter(Var1 %in% top_bigram_words)

  # Rename the count columns so they can be told apart as graph attributes
  names(df_word_list)[2] <- "Term_Frequency"
  names(df_top_bigrams)[3] <- "Edge_Frequency"

  graph_hold <- graph_from_data_frame(
    d = df_top_bigrams,
    vertices = df_word_list
  )

  # Visualization
  pl <- ggraph(graph_hold, layout = "fr") +
    geom_edge_link(aes(edge_alpha = Edge_Frequency), show.legend = TRUE) +
    geom_node_point(
      aes(color = Term_Frequency, size = Term_Frequency),
      alpha = 0.7
    ) +
    geom_node_text(aes(label = name), repel = TRUE) +
    scale_color_viridis_c(direction = -1) +
    theme_void() +
    # Size duplicates the colour scale, so only the colour legend is shown
    guides(size = "none") +
    labs(title = topic_title) +
    theme(plot.title = element_text(size = 26, face = "bold"))

  # Replace every run of whitespace so multi-word titles give clean file names
  file_slug <- str_to_lower(str_replace_all(topic_title, "\\s+", "_"))
  ggsave(
    filename = paste0("../figures/", "bigrams_", file_slug, ".png"),
    plot = pl,
    width = 12,
    height = 8
  )

  pl
}
Read in the following data sources:
# Web scraped data: lower-case the column names, rename the "topic" column
# to "term", and lower-case the term values
df_raw <- read_csv("../data/AllWebscrape.csv")
df_raw <- rename_all(df_raw, tolower)
df_raw <- rename(df_raw, term = topic)
df_raw <- mutate(df_raw, term = str_to_lower(term))

# Topics and search terms data: lower-case the column names and term values
search_terms <- read_csv("../raw-data/SearchTerms.csv")
search_terms <- rename_all(search_terms, tolower)
search_terms <- mutate(search_terms, term = str_to_lower(term))
Join journal and search term dataframes to get topics information by journals.
The topic for “global alignment sequence” and “read alignment sequence” is hard-coded because the corresponding entries in the search terms dataframe do not contain the word “sequence”, so the topic cannot be assigned by matching on the search term.
# Attach the search-term columns to each scraped record by its term, then
# set the topic explicitly for the two alignment terms listed below
df <- left_join(df_raw, search_terms, by = "term")
df <- mutate(
  df,
  topic = ifelse(
    term %in% c("global alignment sequence", "read alignment sequence"),
    "Sequence Alignment",
    topic
  )
)
Journal information is divided into 10 dataframes by topic. Visualizations can now be created by topic.
# One data frame per topic, each holding the rows of df with that topic
df_assembly      <- filter(df, topic == "Assembly")
df_databases     <- filter(df, topic == "Databases")
df_epigenetics   <- filter(df, topic == "Epigenetics")
df_geneexp       <- filter(df, topic == "Gene Expression")
df_genomeann     <- filter(df, topic == "Genome Annotation")
df_phylogenetics <- filter(df, topic == "Phylogenetics")
df_seqal         <- filter(df, topic == "Sequence Alignment")
df_sequence      <- filter(df, topic == "Sequencing")
df_strucpred     <- filter(df, topic == "Structural Prediction")
df_varcall       <- filter(df, topic == "Variant Calling")
Create bigram plots to visualize the relationships between two words. The darker lines represent higher frequency of occurrence.
# visualize_bigrams(df,abstract, "All Topics")
# Each call passes its topic as the title. The title is also used to build
# the output file name, so an empty title would make every plot overwrite
# the same file, "../figures/bigrams_.png".
visualize_bigrams(df_assembly, abstract, "Assembly")
visualize_bigrams(df_databases, abstract, "Databases")
visualize_bigrams(df_epigenetics, abstract, "Epigenetics")
visualize_bigrams(df_geneexp, abstract, "Gene Expression")
visualize_bigrams(df_genomeann, abstract, "Genome Annotation")
visualize_bigrams(df_phylogenetics, abstract, "Phylogenetics")
visualize_bigrams(df_seqal, abstract, "Sequence Alignment")
visualize_bigrams(df_sequence, abstract, "Sequencing")
visualize_bigrams(df_strucpred, abstract, "Structural Prediction")
visualize_bigrams(df_varcall, abstract, "Variant Calling")