NORTHWEST PATHOGEN GENOMICS CENTER OF EXCELLENCE

Data Catalog

This is a static website and demo of a data catalog. Search for data objects in the metadata viewer, or visualize data product relationships in the directed acyclic graph (DAG) below.

dag metdata viewer

#| standalone: true
#| #components: [editor, viewer]
#| viewerHeight: 1000
#| column: page


## file: app.R
# import dag
library(shiny)
library(DiagrammeR)
library(dplyr)
library(tidyr)

# Read the CSV file when the script starts
metadata <- read.csv(url("https://raw.githubusercontent.com/NW-PaGe/dstt_catalog_demo/refs/heads/main/metadata.csv")) %>%
  mutate(across(where(is.character), function(.) ifelse(. == '', NA_character_, .)))

# Define UI for the app
dag_ui <- fluidPage(
  sidebarLayout(
    sidebarPanel(
      h4("Select nodes"),
      checkboxGroupInput("selected_nodes", "Nodes", choices = unique(metadata$Product_Name), selected = unique(metadata$Product_Name)),
      hr()
    ),
    
    mainPanel(
      grVizOutput("dag_plot", width = "100%", height = "600px")
    )
  )
)

# Define server logic for the app
dag_server <- function(input, output, session) {
  
  # Create a reactive DAG from selected nodes
  reactive_dag <- reactive({
    ### Create nodes based on inclusion in selected input
    nodes_final <- metadata %>% 
      filter(Product_Name %in% input$selected_nodes) %>%
      select(Product_Name) %>% 
      mutate(Product_Name_Copy = Product_Name) %>% 
      distinct()
    
    ### Create edges based on `Parent` and `Child` columns ----------------
    
    # unnest any nodes with 2+ parents or children
    unnested_data <- metadata %>% 
      select(Product_Name, Parent, Child) %>% 
      mutate(Product_Name_Copy = Product_Name) %>% 
      tidyr::separate_rows(Parent, sep = ";") %>% 
      tidyr::separate_rows(Child, sep = ";") %>% 
      distinct()
    
    # identify all possible parents
    parent_df <- unnested_data %>% 
      filter(!is.na(Parent)) %>% 
      mutate(Child = Product_Name)

    # identify all possible children
    child_df <- unnested_data %>% 
      filter(!is.na(Child)) %>% 
      mutate(Parent = Product_Name)
    
    # join parents and children to get all possible edges
    edges <- rbind(parent_df, child_df) %>% 
      select(Parent, Child) %>% 
      distinct()
    
    # get direct edges (edges where no product has been deselected)
    direct_edges <- edges %>% 
      filter(Parent %in% input$selected_nodes & Child %in% input$selected_nodes) %>% 
      mutate(style = 'solid') # set edge attribute so line will be solid
    
    # get indirect edges (edges where child or parent has been deselected)
    indirect_parent <- edges %>% 
      filter(!Parent %in% input$selected_nodes) %>% 
      rename(removed=Parent)
    indirect_child <- edges %>% 
      filter(!Child %in% input$selected_nodes) %>% 
      rename(removed=Child)
    indirect_edges <- indirect_parent %>% 
      left_join(indirect_child, by='removed') %>% 
      select(-removed) %>% 
      mutate(style = 'dashed') # set edge attribute so line will be dashed
    
    # concat dfs for final edges df with direct/indirect relationships included
    edges_final <- rbind(indirect_edges, direct_edges)
    
    ### Create the DAG using DiagrammeR ------------------------------------------
    dag <- create_graph() %>%
      add_nodes_from_table(table = nodes_final,
                           label_col = "Product_Name") %>%
      add_edges_from_table(table = edges_final,
                           from_col = "Parent",
                           to_col = "Child",
                           from_to_map = "Product_Name_Copy")

    return(dag)
  })
  
  observe({
    # Render the DAG plot
    output$dag_plot <- renderGrViz({
      dag <- reactive_dag()
      dag %>% render_graph()
    })  
  })
  
}

# # render shinyapp
# shinyApp(ui=dag_ui, server=dag_server)

# import docx_to_html function
docx_to_html <- function(doc) {
  #' Convert docx to html 
  #' 
  #' Return a semi formatted html version of a docx file. This function
  #' accepts docx objects (output by officer::read_docx) and outputs an html-marked character element
  #' with some headers and tables formatted. Media will not be included in output.
  #' 
  #' @param doc docx object output from officer::read_docx
  #' @seealso [officer::read_docx()]
  #' 
  
  
  library(officer)
  library(dplyr)
  library(htmltools)
  
  doc_content <- officer::docx_summary(doc) # convert doc to data frame with rows of docx elements
  
  doc_content <- dplyr::arrange(doc_content, doc_index, row_id, cell_id) # arrange df by index, then row id, then cell id
  
  html_content <- "" # Initialize html content element
  
  is_table <- FALSE # Set flag to check if a table is being constructed
  
  # Step through the document content rowwise to render paragraphs, headers, tables, and page breaks
  for (i in 1:nrow(doc_content)) {
    # close the row and table if the previous row was part of a table
    if (is_table & !doc_content$content_type[i] %in% "table cell") {
      html_content <- paste0(html_content, "</tr></table>")  # Close row and table
      is_table <- FALSE # reset to false
    }
    # if row is paragraph type ...
    if (doc_content$content_type[i] %in% "paragraph") {
      # insert a horizontal line if a page break is present
      if (tolower(doc_content$text[i]) %in% 'page break') {
        html_content <- paste0(html_content, htmltools::hr())
      # otherwise, if a heading style, insert text with a heading tag (add +1 to heading # as we assign title as h1 later)  
      } else if (grepl("heading [1-5]", doc_content$style_name[i])) {
        header = gsub("heading ([1-5])", "\\1", doc_content$style_name[i])
        html_content <- paste0(
          html_content,
          switch(as.numeric(header)+1,
                 htmltools::h1(doc_content$text[i]),
                 htmltools::h2(doc_content$text[i]),
                 htmltools::h3(doc_content$text[i]),
                 htmltools::h4(doc_content$text[i]),
                 htmltools::h5(doc_content$text[i]),
                 htmltools::h6(doc_content$text[i])
          )
        )
      # otherwise, if a title style, insert text with h1 tags
      } else if (tolower(doc_content$style_name[i]) %in% "title") {
        html_content <- paste0(html_content, htmltools::h1(doc_content$text[i]))
      # otherwise, insert text with paragraph tags
      } else {
        html_content <- paste0(html_content, htmltools::p(doc_content$text[i]))
      }
    # otherwise (if not a paragraph type), if it is a table cell in row 1 cell 1 (meaning, the first cell of a table), insert the table border
    } else if (doc_content$content_type[i] %in% "table cell" & doc_content$row_id[i]%in%1 & doc_content$cell_id[i]%in%1) {
      html_content <- paste0(html_content, "<table border='1'>")  # Start a table
      is_table <- TRUE # set table flag to true
    # otherwise, if table cell and the first cell of a new row, add a new row tag
    } else if (doc_content$content_type[i] %in% "table cell" & doc_content$row_id[i] > doc_content$row_id[i-1]) {
      html_content <- paste0(html_content, "</tr><tr>")  # Close prev row and start a new row
    # insert text if not paragraph of table cell w/o formatting
    } else if (!doc_content$content_type[i] %in% c("paragraph", "table cell")) {
      html_content <- paste0(html_content, doc_content$text[i])
    }
    # if a table cell, insert text with td tag
    if (doc_content$content_type[i] %in% "table cell") {
      html_content <- paste0(html_content, "<td>", doc_content$text[i], "</td>")  # Add cell content
    }
  }        
  
  # convert character element to element marked as HTML
  HTML(html_content)
}

# import split_values.R
library(stringr)

split_values <- function(x, sep=';', unlist=FALSE) {
  #' split_values
  #' 
  #' Function splits char elements inside char vectors and removes surrounding whitespace
  #' 
  #' @param x character or vector of characters
  #' @param sep string to split on; default ";"
  #' @param unlist whether or not to unlist the character list; default FALSE
  #' @return a list (or vector, if unlist=TRUE) of character elements
  #' 
  #' @example split_values(x=c('this', 'string; is', 'split')) 
  #' @example split_values(x=c('This will be return as a vector'), sep=',', unlist=TRUE)
  #' 
  x = stringr::str_split(x, sep) # split by sep and return string
  x = sapply(x, trimws) # trim whitespace of strings in list
  if (unlist) {
    x = unlist(x)
  }
  return(x)
}

 # import server & ui code
library(shiny)
library(DT)
library(officer)
library(duckdb)
library(jsonlite)

# Read the CSV file when the script starts
metadata <- read.csv(url("https://raw.githubusercontent.com/NW-PaGe/dstt_catalog_demo/refs/heads/main/metadata.csv")) %>%
  mutate(across(where(is.character), function(.) ifelse(. == '', NA_character_, .)))

# Define the UI
metadata_ui <- fluidPage(
  # Add custom CSS
  tags$head(
    tags$style(HTML("
      .filter-tag {
        display: inline-block;
        margin: 3px;
        padding: 6px 12px;
        background-color: #3498db;
        color: white;
        border-radius: 15px;
        cursor: pointer;
        font-size: 14px;
        transition: background-color 0.3s ease;
      }
      .filter-tag:hover {
        background-color: #2980b9;
      }
      .filter-label {
        font-weight: bold;
        margin-right: 5px;
      }
      .filter-value {
        font-weight: normal;
      }
      .remove-icon {
        margin-left: 8px;
        font-weight: bold;
        opacity: 0.8;
      }
      .filter-tag:hover .remove-icon {
        opacity: 1;
      }
      #fileListContent {
        max-height: none;
        overflow-y: hidden;
      }
      #fileListContent li:nth-child(n+7) {
        display: none;
      }
      #fileListContent li:nth-child(6) {
        white-space: nowrap;
        overflow: visible;
        text-overflow: ellipsis;
        opacity: 0.7;
        position: relative;
        padding-right: 20px;
        list-style-type: disc !important;
      }
      #fileListContent li:nth-child(6) a {
        overflow: hidden;
        text-overflow: ellipsis;
        white-space: nowrap;
        display: block;
      }
      #fileListContent li:nth-child(6)::after {
        content: '';
        position: absolute;
        bottom: 0;
        left: -20px;
        right: 0;
        height: 60%;
        background: linear-gradient(to bottom, transparent, white);
        pointer-events: none;
      }
      #fileListContent.expanded li {
        display: list-item !important;
        white-space: normal !important;
        opacity: 1 !important;
      }
      #fileListContent.expanded li:nth-child(6)::after {
        display: none;
      }
      #toggleBtn {
        color: #3498db;
        cursor: pointer;
        border: none;
        background: none;
        padding: 5px 10px;
        width: 100%;
        text-align: center;
        margin-top: 10px;
        font-size: 16px;
      }
      .file-list-ul {
        list-style-type: disc !important;
        padding-left: 20px;
        margin: 0;
      }
      .file-list-ul li {
        list-style-type: disc !important;
      }
    ")),
    tags$script(HTML("
      function toggleFileList() {
        var content = document.getElementById('fileListContent');
        var btn = document.getElementById('toggleBtn');
        if (content.classList.contains('expanded')) {
          content.classList.remove('expanded');
          btn.innerHTML = '▾';
        } else {
          content.classList.add('expanded');
          btn.innerHTML = '▴';
        }
      }
    "))
  ),
  sidebarLayout(
    sidebarPanel(
      # Static text inputs for certain columns
      textInput("product_id", "Product ID", ""),
      textInput("product_name", "Product Name", ""),
      textInput("keywords", "Keywords", "", placeholder = "variant, genbank, lineage"),


      
      # Static text inputs dynamically pulled from metadata.csv for certain columns
      selectInput("location", "Location", choices = c("All", unique(split_values(metadata$Location, unlist=T))), selected = "All"),
      selectInput("steward", "Steward", choices = c("All", unique(split_values(metadata$Steward, unlist=T))), selected = "All"),
      selectInput("users", "Users", choices = c("All", unique(split_values(metadata$Users, unlist=T))), selected = "All"),
      selectInput("pii", "PII", choices = c("All", unique(split_values(metadata$PII, unlist=T))), selected = "All"),
      selectInput("source", "Source", choices = c("All", unique(split_values(metadata$Source, unlist=T))), selected = "All"),
      selectInput("ftype", "File Type", choices = c("All", unique(tolower(gsub("^.+\\.(.+)$", "\\1", metadata$Connection)))), selected = "All")
    ),
    mainPanel(
      h3("Filters Applied:"),
      uiOutput("filtersSummary"),
      br(),
      h3("Filtered File Paths:"),
      uiOutput("fileList"),
      br(),
      h3("Selected File Data:"),
      DTOutput("fileDataTable"),
      uiOutput("docx_content"),
      br()
    )
  )
)

# Define the server logic
metadata_server <- function(input, output, session) {
  observe({
    output$fileList <- renderUI({
      file_paths <- filteredData()$Connection
      
      tagList(
        tags$div(
          id = "fileListContent",
          tags$ul(
            class = "file-list-ul",
            lapply(seq_along(file_paths), function(i) {
              tags$li(
                actionLink(
                  inputId = paste0("file_", i),
                  label = file_paths[i],
                  onclick = sprintf("Shiny.setInputValue('last_clicked', '%s', {priority: 'event'});", 
                                    paste0("file_", i))
                )
              )
            })
          )
        ),
        tags$button(
          id = "toggleBtn",
          onclick = "toggleFileList()",
          "▾"
        )
      )
    })
  }, priority = 1000)
  
  # Original filtering logic remains the same
  filteredData <- reactive({
    data <- metadata
    
    # Apply filters based on input values
    if (!is.null(input$product_id) && input$product_id != "") {
      data <- subset(data, grepl(input$product_id, Product_ID, ignore.case = TRUE))
    }
    if (!is.null(input$product_name) && input$product_name != "") {
      data <- subset(data, grepl(input$product_name, Product_Name, ignore.case = TRUE))
    }
    if (input$keywords != "") {
      data <- subset(data, grepl(input$keywords, Keywords, ignore.case = TRUE))
    }
    if (!is.null(input$location) && input$location != "All") {
      data <- subset(data, sapply(split_values(Location), function(.) input$location %in% .))
    }
    if (!is.null(input$steward) && input$steward != "All") {
      data <- subset(data, sapply(split_values(Steward), function(.) input$steward %in% .))
    }
    if (!is.null(input$users) && input$users != "All") {
      data <- subset(data, sapply(split_values(Users), function(.) input$users %in% .))
    }
    if (!is.null(input$pii) && input$pii != "All") {
      data <- subset(data, sapply(split_values(PII), function(.) input$pii %in% .))
    }
    if (!is.null(input$source) && input$source != "All") {
      data <- subset(data, sapply(split_values(Source), function(.) input$source %in% .))
    }
    if (!is.null(input$ftype) && input$ftype != "All") {
      data <- subset(data, grepl(paste0("\\.", input$ftype), Connection, ignore.case=TRUE))
    }
    
    return(data)
  })
  # Modified filters summary with enhanced styling
  output$filtersSummary <- renderUI({
    filters <- list()
    
    if (!is.null(input$product_id) && input$product_id != "") {
      filters$product_id <- list(label = "Product ID", value = input$product_id)
    }
    if (!is.null(input$product_name) && input$product_name != "") {
      filters$product_name <- list(label = "Product Name", value = input$product_name)
    }
    if (!is.null(input$keywords) && input$keywords != "") {
      filters$keywords <- list(label = "Keywords", value = input$keywords)
    }
    if (!is.null(input$location) && input$location != "All") {
      filters$location <- list(label = "Location", value = input$location)
    }
    if (!is.null(input$steward) && input$steward != "All") {
      filters$steward <- list(label = "Steward", value = input$steward)
    }
    if (!is.null(input$users) && input$users != "All") {
      filters$users <- list(label = "Users", value = input$users)
    }
    if (!is.null(input$pii) && input$pii != "All") {
      filters$pii <- list(label = "PII", value = input$pii)
    }
    if (!is.null(input$source) && input$source != "All") {
      filters$source <- list(label = "Source", value = input$source)
    }
    if (!is.null(input$ftype) && input$ftype != "All") {
      filters$ftype <- list(label = "File Type", value = input$ftype)
    }
    
    if (length(filters) == 0) {
      return(p("No filters applied."))
    } else {
      # Create interactive filter tags with enhanced styling
      tags$div(
        lapply(names(filters), function(filter_name) {
          tags$div(
            class = "filter-tag",
            onclick = sprintf("Shiny.setInputValue('remove_filter', '%s', {priority: 'event'});", filter_name),
            tags$span(
              class = "filter-label",
              filters[[filter_name]]$label
            ),
            tags$span(
              class = "filter-value",
              filters[[filter_name]]$value
            ),
            tags$span(
              class = "remove-icon",
              HTML("&times;")
            )
          )
        })
      )
    }
  })
  
  # Observer to handle filter removal
  observeEvent(input$remove_filter, {
    filter_name <- input$remove_filter
    if (filter_name %in% c("product_id", "product_name", "keywords")) {
      updateTextInput(session, filter_name, value = "")
    } else {
      updateSelectInput(session, filter_name, selected = "All")
    }
  })
  
  # Reactive value to store the selected file path
  selectedFile <- reactiveVal(NULL)
  
  # Replace the original observer with this new one that responds to last_clicked
  observeEvent(input$last_clicked, {
    file_index <- as.numeric(gsub("file_", "", input$last_clicked))
    file_paths <- filteredData()$Connection
    if (file_index <= length(file_paths)) {
      selectedFile(file_paths[file_index])
    }
  }, ignoreInit = TRUE)
  
  # Render the data from the selected file
  output$fileDataTable <- renderDT({
    req(selectedFile())  # Ensure there is a selected file
    if (grepl('\\.csv$', selectedFile())) {
      tryCatch({
        file_url <- paste0('https://raw.githubusercontent.com/NW-PaGe/dstt_catalog_demo/refs/heads/main/', selectedFile())
        file_data <- read.csv(url(file_url))
        # Convert dataframe to datatable object
        datatable(file_data, options = list(pageLength = 10, autoWidth = TRUE))
      }, error = function(e) {
        datatable(data.frame(Error = paste("Unable to read file:", e$message)),
                  options = list(pageLength = 1, dom = 't'))
      })
    } else if (grepl('\\.parquet$', selectedFile())) {
      tryCatch({
        # get file location url
        file_url <- paste0('https://raw.githubusercontent.com/NW-PaGe/dstt_catalog_demo/refs/heads/main/', selectedFile())
        # check file size before downloading:
        api_url <- paste0("https://api.github.com/repos/NW-PaGe/dstt_catalog_demo/contents/", selectedFile()) # Construct the GitHub API URL for the file
        con <- url(api_url, "rb") # open connection
        response <- readLines(con) # get response
        close(con) # close connection
        json_data <- jsonlite::fromJSON(paste(response, collapse = '')) # parse json response
        file_size <- as.numeric(json_data$size)/1024  # get size in KB
        # download parquet if small enough file
        if (file_size < 100000) { # check if file is under 100 KB before attempting download
          # Need to download temp file since httpfs extension not usable with Shinylive:
          temp_file <- tempfile(fileext = ".parquet") # create temp parquet file
          download.file(file_url, temp_file, mode = "wb")  # write data from file at url to temp file
          # read parquet
          con <- dbConnect(duckdb(), ':memory:') # create db in mem
          file_data <- dbGetQuery(con, glue::glue("SELECT * FROM read_parquet('{temp_file}') LIMIT 10000")) # Read data and limit preview size to 10k rows
          dbDisconnect(con, shutdown = TRUE) # close in mem db connection
          # convert dataframe to datatable object
          datatable(file_data, options = list(pageLength = 10, autoWidth = TRUE))
        } else {
          datatable(data.frame(Warning = paste("Unable to read file: File too large. File size (KB):", format(file_size, nsmall=2, big.mark=','))),
                    options = list(pageLength = 1, dom = 't'))
        }
      }, error = function(e) {
        datatable(data.frame(Error = paste("Unable to read file:", e$message)),
                  options = list(pageLength = 1, dom = 't'))
      })
    } else {
      datatable(data.frame())
    }
  })
  
  # Render docx content
  output$docx_content <- renderUI({
    req(selectedFile())  # Ensure there is a selected file
    if (grepl('\\.docx$', selectedFile())) {
      tryCatch({
        file_url <- paste0('https://raw.githubusercontent.com/NW-PaGe/dstt_catalog_demo/main/', selectedFile())
        temp_file <- tempfile(fileext = ".docx")
        download.file(file_url, destfile = temp_file, mode = "wb")
        # Read the .docx file using officer
        doc <- read_docx(temp_file)
        # Convert docx to html element using code from src
        docx_to_html(doc)
      }, error = function(e) {
        HTML(paste("Unable to read file:", e$message))
      })
    } else {
      HTML('')
    }
  })
}

# # Run the application 
# shinyApp(ui=metadata_ui, server=metadata_server)


# Create main ui
ui <- fluidPage(
  tags$style(
    HTML("
          h1.title {
            text-align: center;
            font-weight: bold;
          }
          hr.div {
            width: 100%;
            border: 5px dotted #000000;
            border-style: none none dotted;
          }
         ")
  ),
  tags$br(id='dag-br'),
  h1(id='dag', class='title', 'Directed Acyclic Graph (DAG):'),
  tags$br(),
  tags$br(),
  fluidRow(column(12, dag_ui)), # insert dag ui
  tags$hr(class='div'),
  tags$br(id='metadata-viewer-br'),
  h1(id='metadata-viewer', class='title', 'Metadata Viewer:'),
  tags$br(),
  tags$br(),
  fluidRow(column(12, metadata_ui)) # insert metadata ui 
)

# create main app
server <- function(input, output, session) {
  dag_server(input, output, session) # insert dag server logic
  metadata_server(input, output, session) # insert metadata server logic
}

shinyApp(ui, server)