require(dplyr)
require(pheatmap)
require(reshape2)
require(IncDTW)
require(dtwclust)
require(RColorBrewer)
require(caret)
require(lubridate)
require(ggplot2) # needed for the ggplot() calls below
For this tutorial, I will use a multivariate time-series data set comprising yearly PubMed counts for 1000 drug ingredients between 1950 and 2020:
set.seed(15651)
count_table <- read.csv("../OB_analytics/output/pubmed_count_table_ob.csv", stringsAsFactors = FALSE)

# Randomly choose 1000 unique ingredient names and retrieve the corresponding series
ing_selected <- sample(unique(count_table$ing_name), 1000, replace = FALSE)

count_table <- count_table %>%
  filter(ing_name %in% ing_selected) %>%
  filter(Year %in% 1950:2020) %>%
  select(Year, ing_name, contains("count"))

count_table %>%
  head(10)
The data is organized in long format. Each unique ingredient is linked to four count variables that represent the observed PubMed publication counts for that ingredient. The total counts series represents all PubMed counts; the other series are subsets of it, using clinical trial types as filters. Hence, the data set can be considered a “multivariate time-series”, even though the dimensions are clearly dependent on one another.
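To make the layout concrete, here is a minimal sketch of the long format with a few hypothetical rows (the values are made up purely for illustration):

# A toy illustration of the long format: one row per (Year, ingredient),
# with several count columns per row (hypothetical values)
data.frame(Year = c(1990, 1990, 1991),
           ing_name = c("ASPIRIN", "IBUPROFEN", "ASPIRIN"),
           pubmed_query_total_counts = c(120, 85, 131),
           pubmed_query_clinical_trial_counts = c(14, 9, 17))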
For the purpose of univariate experiments, let’s focus on the least sparse time-series for each ingredient:
counts_uni <- count_table %>%
  select(Year, ing_name, counts = pubmed_query_total_counts) %>%
  dcast(Year ~ ing_name, value.var = "counts") %>%
  select(-Year)
row.names(counts_uni) <- 1950:2020
colnames(counts_uni) <- make.names(colnames(counts_uni))
head(counts_uni,10)
Hence, we transformed the univariate time-series of each drug ingredient across the years 1950-2020 into a matrix representation.
Let’s look at the resulting data matrix:
# Remove all-zero counts if any
mask <- apply(counts_uni, 2, sum) != 0
counts_uni <- counts_uni[, mask]

# Min-max scale the counts
counts_uni_scaled <- apply(counts_uni, 2, function(x){
  return( (x - min(x)) / (max(x) - min(x)) )
}) %>% data.frame()
pheatmap(counts_uni_scaled,
cluster_rows = F,
cluster_cols = F,
fontsize_col = 1,
fontsize_row = 8,cellheight = 7,
color = colorRampPalette(brewer.pal(n = 7, name ="Oranges"))(1000))
For many drug ingredients, there is a higher count/signal as we approach the late 2010s.

The IncDTW package provides a streamlined way (implemented in C++) to calculate pairwise DTW similarities (‘distances’), and it leverages parallel computing.
Two functions are particularly noteworthy here (a toy sketch of both follows):

- dtw_dismat() calculates all pairwise DTW distances between the time series in a list (lot argument)

- dtw_disvec() calculates the DTW distances of all time series in a list (lot argument), all relative to one query time series (Q argument)
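A minimal sketch of both functions on made-up toy series:

# Toy series for illustration
s1 <- c(0, 0.1, 0.9, 1.0, 0.4)
s2 <- c(0, 0.8, 1.0, 0.5, 0.1)
s3 <- c(0, 0.2, 0.7, 1.0, 0.6)

# All pairwise DTW distances for a list of series
IncDTW::dtw_dismat(lot = list(s1, s2, s3), dist_method = "norm2",
                   return_matrix = TRUE)$dismat

# Distances of all series in a list relative to one query series
IncDTW::dtw_disvec(Q = s1, lot = list(s2, s3), dist_method = "norm2")$disvec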
Here, I will utilize dtw_dismat to conveniently generate a pairwise similarity matrix using the sample data, then use hierarchical clustering to visualize similarities between the time series:
distmatrix <- dtw_dismat(lot = c(counts_uni_scaled), # convert the scaled univariate time-series to a list
                         dist_method = "norm2", # using the 2-norm as the base distance for DTW
                         step_pattern = "symmetric2",
                         normalize = FALSE, # returns normalized pairwise distances if TRUE; important when series have different lengths, but here all series have the same length
                         ws = NULL, # window size for the Sakoe-Chiba window; not set in this case
                         threshold = NULL, # early abandoning is not used here, but can be enabled via this parameter
                         ncores = 7, # number of cores to use in parallel computing (uses the parallel package by default)
                         return_matrix = TRUE # returns a symmetric pairwise distance matrix
)
# Use log of the distances to examine clusters
log_dist_matrix <- log1p(distmatrix$dismat)

pheatmap(log_dist_matrix,
cluster_rows = T,
cluster_cols = T,
fontsize_row = 0.1,
fontsize_col = 0.1,
clustering_method = "complete",
cutree_rows = 4,
cutree_cols = 4,
fontsize = 12)
Overall, we notice at least two distinct clusters pushed towards opposing sides. In general, we see more dissimilarity (higher pairwise DTW values) across the series.

First, let's extract some of these clusters:
distmatrix <- dtw_dismat(lot = c(counts_uni_scaled), # convert the scaled univariate time-series to a list
                         dist_method = "norm2_square", # using the squared 2-norm as the base distance for DTW
                         step_pattern = "symmetric2",
                         normalize = FALSE, # returns normalized pairwise distances if TRUE; important when series have different lengths, but here all series have the same length
                         ws = NULL, # window size for the Sakoe-Chiba window; not set in this case
                         threshold = NULL, # early abandoning is not used here, but can be enabled via this parameter
                         ncores = 7, # number of cores to use in parallel computing (uses the parallel package by default)
                         return_matrix = FALSE # returns a dissimilarity object
)
# Use log of the distances to examine clusters
log_dist_matrix <- log1p(distmatrix$dismat)

clusters <- hclust(log_dist_matrix, method = "complete")

cut_height <- sort(clusters$height, decreasing = TRUE)[15] # mark the cut point for the 15 highest-level clusters
plot(clusters,labels = F, col = "navy")
abline(h = cut_height, col = "red", lty = 2)
Next, we focus on the 15 clusters that are highest in the hierarchy by cutting the dendrogram and obtaining the cluster membership of each series:
cluster_membership <- cutree(clusters, h = cut_height)

cluster_membership[1:5]
## ABACAVIR.SULFATE ABACAVIR.SULFATE..LAMIVUDINE
## 1 1
## ABARELIX ACALABRUTINIB
## 2 1
## ACAMPROSATE.CALCIUM
## 3
unique(cluster_membership)
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
As we can see, we selected the top 15 clusters (since we sorted the dendrogram heights before cutting the tree). The resulting named vector represents cluster membership, which we can use to plot the series across groups.
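For instance, a quick tabulation of this vector gives the cluster sizes:

# Cluster sizes implied by the membership vector
table(cluster_membership)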
At the same time, we will use dba() to calculate barycenters (centroids, or average series, obtained using DTW) for a given cluster and plot them alongside the series.
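To build intuition for what dba() returns, here is a minimal sketch on two made-up series; the barycenter is an "average" series computed under DTW alignment rather than pointwise across time:

# Toy illustration of a DTW barycenter (made-up series)
s1 <- c(0, 0.1, 0.9, 1.0, 0.4)
s2 <- c(0, 0.8, 1.0, 0.5, 0.1)
IncDTW::dba(lot = list(s1, s2), dist_method = "norm2",
            step_pattern = "symmetric2")$m1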
Let’s plot for Cluster 1:
# Timeline of the plots (i.e., the x-axis)
timeline <- as.numeric(row.names(counts_uni_scaled))

i = 1 # cluster index

# Identify the columns that represent the series in the given cluster
columns_selected <- colnames(counts_uni_scaled) %in% names(cluster_membership[cluster_membership == i])

# List of series in the given cluster
series_in_cluster <- c(counts_uni_scaled[, columns_selected])

# Calculate the barycenter centroid for the given cluster
bc <- IncDTW::dba(lot = series_in_cluster,
                  dist_method = "norm2",
                  step_pattern = "symmetric2"
)$m1 # m1 is the series that corresponds to the barycenter

# Plot each series in the cluster
plot(x = timeline, y = rep(1.1, length(timeline)), type = "n", ylab = "signal", ylim = c(0, 1.1),
     main = paste0("Cluster ", i))
for (s in 1:length(series_in_cluster)){
  lines(x = timeline, y = series_in_cluster[[s]], col = scales::alpha("grey", 0.5))
}

# Add the barycenter
lines(x = timeline, y = bc, col = "red", lwd = 2)
Let’s plot all 15 clusters as a 5 x 3 plot:
# Timeline of the plots (i.e., the x-axis)
timeline <- as.numeric(row.names(counts_uni_scaled))

par(mfrow = c(5,3))

for (i in 1:length(unique(cluster_membership))){

  # Identify the columns that represent the series in the given cluster
  columns_selected <- colnames(counts_uni_scaled) %in% names(cluster_membership[cluster_membership == i])

  # List of series in the given cluster
  series_in_cluster <- c(counts_uni_scaled[, columns_selected])

  # Calculate the barycenter centroid for the given cluster
  bc <- IncDTW::dba(lot = series_in_cluster,
                    dist_method = "norm2",
                    step_pattern = "symmetric2"
  )$m1 # m1 is the series that corresponds to the barycenter

  # Plot each series in the cluster
  plot(x = timeline, y = rep(1.1, length(timeline)), type = "n", ylab = "signal", ylim = c(0, 1.1),
       main = paste0("Cluster ", i))
  for (s in 1:length(series_in_cluster)){
    lines(x = timeline, y = series_in_cluster[[s]], col = scales::alpha("grey", 0.5))
  }

  # Add the barycenter
  lines(x = timeline, y = bc, col = "red", lwd = 2)
}
This visualization illustrates a few remarkable points, most notably how little signal many of the series carry in the earlier decades. Hence, let's focus on the data beyond 1980 to see the impact on the clusters:
set.seed(5465)
counts_uni <- count_table %>%
  select(Year, ing_name, counts = pubmed_query_clinical_trial_counts) %>%
  dcast(Year ~ ing_name, value.var = "counts") %>%
  select(-Year)
row.names(counts_uni) <- 1950:2020
colnames(counts_uni) <- make.names(colnames(counts_uni))

# Remove all-zero counts if any
mask <- apply(counts_uni, 2, sum) != 0
counts_uni <- counts_uni[, mask]

# Min-max scale the counts
counts_uni_scaled <- apply(counts_uni, 2, function(x){
  return( (x - min(x)) / (max(x) - min(x)) )
}) %>% data.frame()
mask <- as.numeric(row.names(counts_uni_scaled)) >= 1980 # focus on data observed in 1980 and beyond

distmatrix <- dtw_dismat(lot = c(counts_uni_scaled[mask,]), # convert the scaled univariate time-series to a list
                         dist_method = "norm2_square", # using the squared 2-norm as the base distance for DTW
                         step_pattern = "symmetric2",
                         normalize = FALSE, # returns normalized pairwise distances if TRUE; important when series have different lengths, but here all series have the same length
                         ws = NULL, # window size for the Sakoe-Chiba window; not set in this case
                         threshold = NULL, # early abandoning is not used here, but can be enabled via this parameter
                         ncores = 7, # number of cores to use in parallel computing (uses the parallel package by default)
                         return_matrix = FALSE # returns a dissimilarity object
)
# Use log of the distances to examine clusters
log_dist_matrix <- log1p(distmatrix$dismat)

clusters <- hclust(log_dist_matrix, method = "complete")
cut_height <- sort(clusters$height, decreasing = TRUE)[15] # mark the cut point for the 15 highest-level clusters
cluster_membership <- cutree(clusters, h = cut_height)
# Timeline of the plots (i.e., the x-axis)
timeline <- as.numeric(row.names(counts_uni_scaled[mask,]))

cluster_series_with_bc = list() # a spare list to hold the series in each cluster along with barycenters

par(mfrow = c(5,3))

for (i in 1:length(unique(cluster_membership))){

  # Identify the columns that represent the series in the given cluster
  columns_selected <- colnames(counts_uni_scaled) %in% names(cluster_membership[cluster_membership == i])

  # List of series in the given cluster
  series_in_cluster <- c(counts_uni_scaled[mask, columns_selected])

  # Calculate the barycenter centroid for the given cluster
  bc <- IncDTW::dba(lot = series_in_cluster,
                    dist_method = "norm2",
                    step_pattern = "symmetric2"
  )$m1 # m1 is the series that corresponds to the barycenter

  # Plot each series in the cluster
  plot(x = timeline, y = rep(1.1, length(timeline)), type = "n", ylab = "signal", ylim = c(0, 1.1),
       main = paste0("Cluster ", i))
  for (s in 1:length(series_in_cluster)){
    lines(x = timeline, y = series_in_cluster[[s]], col = scales::alpha("grey", 0.5))
  }

  # Add the barycenter
  lines(x = timeline, y = bc, col = "red", lwd = 2)

  series_in_cluster <- list(series_in_cluster, bc = bc[,1])
  cluster_series_with_bc[[i]] <- series_in_cluster
}
In fact, restricting the data to 1980 and beyond seems to form clusters that are tighter around the centroids (see in particular clusters 5, 13, 14, and 15), although we still see many noisy clusters.

Next, let's visualize the original (scaled) series, ordered by these 15 clusters, in a heatmap:
# For each series in a given cluster, calculate their distance to their respective barycenter,
# order based on this distance
# Select series from counts_uni_scaled in the selected order, then cbind
series_distance_to_bc <- list()
series_matrix <- list()

for (i in 1:length(cluster_series_with_bc)){

  temp <- cluster_series_with_bc[[i]]
  query <- temp$bc
  lot <- within(temp, rm(bc))[[1]]

  distvector <- dtw_disvec(Q = query, lot = lot,
                           dist_method = "norm2_square", # using the squared 2-norm as the base distance for DTW
                           step_pattern = "symmetric2",
                           normalize = FALSE, # returns normalized pairwise distances if TRUE; important when series have different lengths, but here all series have the same length
                           ws = NULL, # window size for the Sakoe-Chiba window; not set in this case
                           threshold = NULL, # early abandoning is not used here, but can be enabled via this parameter
                           ncores = 7 # number of cores to use in parallel computing (uses the parallel package by default)
  )$disvec

  distvector <- distvector[order(distvector)]

  series_distance_to_bc[[i]] <- distvector
  series_matrix[[i]] <- counts_uni_scaled[names(distvector)]
}
names(series_distance_to_bc) <- paste0("cluster_",1:length(cluster_series_with_bc))
# Prepare the ordered series matrix based on clusters
series_matrix <- do.call(cbind, series_matrix)
mask <- as.numeric(row.names(counts_uni_scaled)) >= 1980 # focus on data observed in 1980 and beyond
series_matrix <- series_matrix[mask,]

# Only label rows once every 5 years to avoid clutter in the plot
mask <- as.numeric(row.names(series_matrix)) %% 5 != 0
labels_row <- ifelse(mask, "", row.names(series_matrix))
labels_row
######################################################
# Add annotations for drugs to identify clusters
Cluster = NULL
Ing.names = NULL

for (k in 1:length(series_distance_to_bc)){
  Cluster <- c(Cluster, rep(names(series_distance_to_bc)[k], length(series_distance_to_bc[[k]])))
  Ing.names <- c(Ing.names, names(series_distance_to_bc[[k]]))
}

annotation_col = data.frame(
  Cluster = Cluster
)
row.names(annotation_col) <- Ing.names

x <- c(brewer.pal(n = 12, name = "Paired"), brewer.pal(n = 3, name = "Dark2"))
names(x) <- names(series_distance_to_bc)
annt_colors <- list(Cluster = x)
######################################################
pheatmap(series_matrix,
cluster_rows = F,
cluster_cols = F,
fontsize_col = 1, annotation_colors = annt_colors, labels_row = labels_row,
fontsize_row = 8,cellheight = 7, annotation_col = annotation_col,
color = colorRampPalette(brewer.pal(n = 7, name ="Oranges"))(1000))
The patterns are more structured across many clusters, while I still observe many noisy clusters without a clear pattern; this is particularly the case for sparse time-series. Hence, the next question becomes whether we can get a better clustering if we filter out series with “little or no variance”.
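The near-zero variance filter used below comes from caret. As a minimal sketch on made-up data of what it flags:

# Toy illustration: caret::nearZeroVar() returns the indices of columns
# that are (nearly) constant; such sparse series carry little shape
# information for clustering
toy <- data.frame(flat = c(rep(0, 99), 1), varying = rnorm(100))
nearZeroVar(toy) # returns 1: the 'flat' column is flagged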
set.seed(5465)
counts_uni <- count_table %>%
  select(Year, ing_name, counts = pubmed_query_clinical_trial_counts) %>%
  dcast(Year ~ ing_name, value.var = "counts") %>%
  select(-Year)
row.names(counts_uni) <- 1950:2020
colnames(counts_uni) <- make.names(colnames(counts_uni))

# Remove all-zero counts if any
mask <- apply(counts_uni, 2, sum) != 0
counts_uni <- counts_uni[, mask]

mask <- as.numeric(row.names(counts_uni)) >= 1980 # focus on data observed in 1980 and beyond
counts_uni <- counts_uni[mask,]

###############################
# Near zero variance filter
###############################
mask <- nearZeroVar(counts_uni)
counts_uni <- counts_uni[, -mask]

# Min-max scale the counts
counts_uni_scaled <- apply(counts_uni, 2, function(x){
  return( (x - min(x)) / (max(x) - min(x)) )
}) %>% data.frame()
distmatrix <- dtw_dismat(lot = c(counts_uni_scaled), # convert the scaled univariate time-series to a list
                         dist_method = "norm2_square", # using the squared 2-norm as the base distance for DTW
                         step_pattern = "symmetric2",
                         normalize = FALSE, # returns normalized pairwise distances if TRUE; important when series have different lengths, but here all series have the same length
                         ws = NULL, # window size for the Sakoe-Chiba window; not set in this case
                         threshold = NULL, # early abandoning is not used here, but can be enabled via this parameter
                         ncores = 7, # number of cores to use in parallel computing (uses the parallel package by default)
                         return_matrix = FALSE # returns a dissimilarity object
)

# Use log of the distances to examine clusters
log_dist_matrix <- log1p(distmatrix$dismat)

clusters <- hclust(log_dist_matrix, method = "complete")
cut_height <- sort(clusters$height, decreasing = TRUE)[15] # mark the cut point for the 15 highest-level clusters
cluster_membership <- cutree(clusters, h = cut_height)
# Timeline of the plots (i.e., the x-axis)
timeline <- as.numeric(row.names(counts_uni_scaled))

cluster_series_with_bc = list() # a spare list to hold the series in each cluster along with barycenters

par(mfrow = c(5,3))

for (i in 1:length(unique(cluster_membership))){

  # Identify the columns that represent the series in the given cluster
  columns_selected <- colnames(counts_uni_scaled) %in% names(cluster_membership[cluster_membership == i])

  # List of series in the given cluster
  series_in_cluster <- c(counts_uni_scaled[, columns_selected])

  # Calculate the barycenter centroid for the given cluster
  bc <- IncDTW::dba(lot = series_in_cluster,
                    dist_method = "norm2",
                    step_pattern = "symmetric2"
  )$m1 # m1 is the series that corresponds to the barycenter

  # Plot each series in the cluster
  plot(x = timeline, y = rep(1.1, length(timeline)), type = "n", ylab = "signal", ylim = c(0, 1.1),
       main = paste0("Cluster ", i))
  for (s in 1:length(series_in_cluster)){
    lines(x = timeline, y = series_in_cluster[[s]], col = scales::alpha("grey", 0.5))
  }

  # Add the barycenter
  lines(x = timeline, y = bc, col = "red", lwd = 2)

  series_in_cluster <- list(series_in_cluster, bc = bc[,1])
  cluster_series_with_bc[[i]] <- series_in_cluster
}
Finally, let's also visualize the original (scaled) series, ordered by these 15 clusters, in a heatmap:
# For each series in a given cluster, calculate their distance to their respective barycenter,
# order based on this distance
# Select series from counts_uni_scaled in the selected order, then cbind
series_distance_to_bc <- list()
series_matrix <- list()

for (i in 1:length(cluster_series_with_bc)){

  temp <- cluster_series_with_bc[[i]]
  query <- temp$bc
  lot <- within(temp, rm(bc))[[1]]

  distvector <- dtw_disvec(Q = query, lot = lot,
                           dist_method = "norm2_square", # using the squared 2-norm as the base distance for DTW
                           step_pattern = "symmetric2",
                           normalize = FALSE, # returns normalized pairwise distances if TRUE; important when series have different lengths, but here all series have the same length
                           ws = NULL, # window size for the Sakoe-Chiba window; not set in this case
                           threshold = NULL, # early abandoning is not used here, but can be enabled via this parameter
                           ncores = 7 # number of cores to use in parallel computing (uses the parallel package by default)
  )$disvec

  distvector <- distvector[order(distvector)]

  series_distance_to_bc[[i]] <- distvector
  series_matrix[[i]] <- counts_uni_scaled[names(distvector)]
}
names(series_distance_to_bc) <- paste0("cluster_",1:length(cluster_series_with_bc))
# Prepare the ordered series matrix based on clusters
series_matrix <- do.call(cbind, series_matrix)
mask <- as.numeric(row.names(counts_uni_scaled)) >= 1980 # focus on data observed in 1980 and beyond
series_matrix <- series_matrix[mask,]

# Only label rows once every 5 years to avoid clutter in the plot
mask <- as.numeric(row.names(series_matrix)) %% 5 != 0
labels_row <- ifelse(mask, "", row.names(series_matrix))
labels_row
######################################################
# Add annotations for drugs to identify clusters
Cluster = NULL
Ing.names = NULL

for (k in 1:length(series_distance_to_bc)){
  Cluster <- c(Cluster, rep(names(series_distance_to_bc)[k], length(series_distance_to_bc[[k]])))
  Ing.names <- c(Ing.names, names(series_distance_to_bc[[k]]))
}

annotation_col = data.frame(
  Cluster = Cluster
)
row.names(annotation_col) <- Ing.names

x <- c(brewer.pal(n = 12, name = "Paired"), brewer.pal(n = 3, name = "Dark2"))
names(x) <- names(series_distance_to_bc)
annt_colors <- list(Cluster = x)
######################################################
pheatmap(series_matrix,
cluster_rows = F,
cluster_cols = F,
fontsize_col = 1, annotation_colors = annt_colors, labels_row = labels_row,
fontsize_row = 8,cellheight = 7, annotation_col = annotation_col,
color = colorRampPalette(brewer.pal(n = 7, name ="Oranges"))(1000))
Some of the clusters look further consolidated, though several fuzzy clusters are still visible.

As a next step in the hierarchical clustering experiment, let's check whether any of the clusters are enriched for known facts about these drug ingredients. Let's go back to the Orange Book, extract some ingredient-level summary metrics, and see whether anything stands out at the cluster level.
ob_table <- readRDS("./input/ob_table.rds")

ob_table_summary <- ob_table %>%
  select(Ingredient, Appl_Type, Appl_No, Type) %>%
  distinct() %>%
  mutate(
    Appl_Type_N = ifelse(Appl_Type == "N", 1, 0),
    Appl_Type_A = ifelse(Appl_Type == "A", 1, 0),
    Type_DISCN = ifelse(Type == "DISCN", 1, 0),
    Type_OTC = ifelse(Type == "OTC", 1, 0),
    Type_RX = ifelse(Type == "RX", 1, 0)
  ) %>%
  select(-Appl_Type, -Type) %>%
  group_by(Ingredient) %>% # aggregate sums over ingredients
  summarise(
    num_unique_applications = length(unique(Appl_No)),
    total_Appl_Type_N = sum(Appl_Type_N),
    total_Appl_Type_A = sum(Appl_Type_A),
    total_Type_DISCN = sum(Type_DISCN),
    total_Type_OTC = sum(Type_OTC),
    total_Type_RX = sum(Type_RX)
  ) %>%
  mutate(Ingredient = make.names(Ingredient)) %>%
  filter(Ingredient %in% names(cluster_membership)) %>%
  mutate(cluster_membership = as.factor(cluster_membership[Ingredient])) %>%
  select(-Ingredient) %>%
  data.frame() %>%
  melt(id.var = "cluster_membership")

ggplot(ob_table_summary, aes(x = cluster_membership, y = value)) +
  geom_boxplot(fill = "navy") +
  facet_wrap(. ~ variable, scales = "free") +
  xlab("Cluster Membership")
The results don't point to a particular cluster that stands out with respect to these ingredient-level summary metrics. Note that clusters 13, 14, and 15 appear slightly higher, but this could also be because these are relatively small clusters.

Interestingly, cluster 4 seems to contain ingredients with slightly more unique applications (on average), and this cluster reflects PubMed time-series profiles that were high in the early 1980s and gradually declined.

Next, let's investigate how the time-series profile of application approval counts for a given cluster tracks that cluster's barycenter.
set.seed(5465)
counts_uni <- count_table %>%
  select(Year, ing_name, counts = pubmed_query_clinical_trial_counts) %>%
  dcast(Year ~ ing_name, value.var = "counts") %>%
  select(-Year)
row.names(counts_uni) <- 1950:2020
colnames(counts_uni) <- make.names(colnames(counts_uni))

# Remove all-zero counts if any
mask <- apply(counts_uni, 2, sum) != 0
counts_uni <- counts_uni[, mask]

mask <- as.numeric(row.names(counts_uni)) >= 1980 # focus on data observed in 1980 and beyond
counts_uni <- counts_uni[mask,]

###############################
# Near zero variance filter
###############################
mask <- nearZeroVar(counts_uni)
counts_uni <- counts_uni[, -mask]

# Min-max scale the counts
counts_uni_scaled <- apply(counts_uni, 2, function(x){
  return( (x - min(x)) / (max(x) - min(x)) )
}) %>% data.frame()
distmatrix <- dtw_dismat(lot = c(counts_uni_scaled), # convert the scaled univariate time-series to a list
                         dist_method = "norm2_square", # using the squared 2-norm as the base distance for DTW
                         step_pattern = "symmetric2",
                         normalize = FALSE, # returns normalized pairwise distances if TRUE; important when series have different lengths, but here all series have the same length
                         ws = NULL, # window size for the Sakoe-Chiba window; not set in this case
                         threshold = NULL, # early abandoning is not used here, but can be enabled via this parameter
                         ncores = 7, # number of cores to use in parallel computing (uses the parallel package by default)
                         return_matrix = FALSE # returns a dissimilarity object
)

# Use log of the distances to examine clusters
log_dist_matrix <- log1p(distmatrix$dismat)

clusters <- hclust(log_dist_matrix, method = "complete")
cut_height <- sort(clusters$height, decreasing = TRUE)[15] # mark the cut point for the 15 highest-level clusters
cluster_membership <- cutree(clusters, h = cut_height)

# Timeline of the plots (i.e., the x-axis)
timeline <- as.numeric(row.names(counts_uni_scaled))

cluster_series_with_bc = list() # a spare list to hold the series in each cluster along with barycenters
########################################################
# Prepare Application summary metric time-series from OB
########################################################
ob_table <- readRDS("./input/ob_table.rds")

ob_table_summary <- ob_table %>%
  select(Ingredient, Appl_Type, Appl_No, Approval_Date) %>%
  distinct() %>%
  filter(Approval_Date != 'Approved Prior to Jan 1, 1982') %>%
  mutate(Approval_Date = mdy(Approval_Date)) %>%
  mutate(approval_year = year(Approval_Date)) %>%
  mutate(Ingredient = make.names(Ingredient)) %>%
  filter(Ingredient %in% names(cluster_membership)) %>%
  mutate(cluster_membership = as.numeric(cluster_membership[Ingredient])) %>%
  group_by(cluster_membership, approval_year) %>%
  summarise(total_num_approvals = n())

approval_table <- expand.grid(approval_year = 1980:2020,
                              cluster_membership = 1:15, stringsAsFactors = FALSE) %>%
  left_join(ob_table_summary) %>%
  mutate(total_num_approvals = ifelse(is.na(total_num_approvals), 0, total_num_approvals)) %>%
  dcast(approval_year ~ cluster_membership)

row.names(approval_table) <- approval_table$approval_year
approval_table <- approval_table %>% select(-approval_year)

# Min-max scale the approval counts
approval_table <- apply(approval_table, 2, function(x){
  return( (x - min(x)) / (max(x) - min(x)) )
})
par(mfrow = c(5,3))

for (i in 1:length(unique(cluster_membership))){

  # Identify the columns that represent the series in the given cluster
  columns_selected <- colnames(counts_uni_scaled) %in% names(cluster_membership[cluster_membership == i])

  # List of series in the given cluster
  series_in_cluster <- c(counts_uni_scaled[, columns_selected])

  # Calculate the barycenter centroid for the given cluster
  bc <- IncDTW::dba(lot = series_in_cluster,
                    dist_method = "norm2",
                    step_pattern = "symmetric2"
  )$m1 # m1 is the series that corresponds to the barycenter

  # Plot each series in the cluster
  plot(x = timeline, y = rep(1.1, length(timeline)), type = "n", ylab = "signal", ylim = c(0, 1.1),
       main = paste0("Cluster ", i))
  for (s in 1:length(series_in_cluster)){
    lines(x = timeline, y = series_in_cluster[[s]], col = scales::alpha("grey", 0.5))
  }

  # Add the barycenter
  lines(x = timeline, y = bc, col = "red", lwd = 2)

  series_in_cluster <- list(series_in_cluster, bc = bc[,1])
  cluster_series_with_bc[[i]] <- series_in_cluster

  # Add total approvals per year
  lines(x = timeline, y = approval_table[, i], col = "navy", lwd = 2)
  legend("topleft", legend = c("Approvals", "Cluster barycenter"), lwd = 2,
         col = c("navy", "red"))
}
This observation is particularly interesting: the total-approvals signal over time closely follows the barycenter of the respective cluster in several cases. This would imply that the underlying structure we unveiled by clustering clinical-trial-related publication counts may correlate with total FDA approvals for the ingredients within that cluster. For certain clusters (for instance, clusters 1 and 9) the barycenter appears to slightly lag the approvals signal, which may signify potential predictive/forecasting power for certain clusters. These observations are worth investigating further.
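As one possible follow-up (not part of the original analysis), a cross-correlation between the approvals signal and a cluster barycenter could quantify such a lead/lag relationship. A minimal sketch, assuming the objects built above are still in scope:

# Sketch: cross-correlate the scaled approvals signal with the cluster
# barycenter to quantify any lead/lag relationship
i <- 1 # cluster index
ccf(approval_table[, i], cluster_series_with_bc[[i]]$bc,
    lag.max = 10, main = paste0("Cluster ", i, ": approvals vs. barycenter"))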
In this section, I will employ the k-shape algorithm described in:

Paparrizos J and Gravano L (2015). “k-Shape: Efficient and Accurate Clustering of Time Series.” In Proceedings of the 2015 ACM SIGMOD International Conference on Management of Data (SIGMOD ’15), pp. 1855-1870. ISBN 978-1-4503-2758-9. doi:10.1145/2723372.2737793. http://www1.cs.columbia.edu/~jopa/Papers/PaparrizosSIGMOD2015.pdf
I will use the API of the dtwclust package in R, which provides a convenient gateway to various time-series clustering algorithms, including k-shape.
First, let’s prepare the time-series data as before:
set.seed(15651)
count_table <- read.csv("../OB_analytics/output/pubmed_count_table_ob.csv", stringsAsFactors = FALSE)

# Randomly choose 1000 unique ingredient names and retrieve the corresponding series
ing_selected <- sample(unique(count_table$ing_name), 1000, replace = FALSE)

count_table <- count_table %>%
  filter(ing_name %in% ing_selected) %>%
  filter(Year %in% 1980:2020) %>%
  select(Year, ing_name, contains("count"))

counts_uni <- count_table %>%
  select(Year, ing_name, counts = pubmed_query_clinical_trial_counts) %>%
  dcast(Year ~ ing_name, value.var = "counts") %>%
  select(-Year)
row.names(counts_uni) <- 1980:2020
colnames(counts_uni) <- make.names(colnames(counts_uni))

###############################
# Near zero variance filter
###############################
mask <- nearZeroVar(counts_uni)
counts_uni <- counts_uni[, -mask]
head(counts_uni,10)
tsclust is the main gateway function in the dtwclust package. Setting the following arguments provides an implementation of the k-shape algorithm: type = “partitional”, preproc = zscore, distance = “sbd”, and centroid = “shape”. Note that in this case I don't min-max scale the time-series, as the k-shape algorithm uses z-scoring as its preprocessing function.
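As a quick illustration of that preprocessing step, dtwclust exports the zscore() helper, which centers and scales each series:

# z-normalization as applied by the k-shape configuration below
dtwclust::zscore(c(1, 2, 3, 4, 5)) # mean 0, standard deviation 1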
lot <- c(counts_uni) # prepare a list of time-series corresponding to each sample

clustering <- tsclust(series = lot,
                      type = "partitional", # type of clustering method to use
                      k = 10, # number of clusters
                      preproc = "zscore", # preprocessing function to use
                      distance = "sbd", # distance metric to use
                      centroid = "shape", # function to calculate centroids
                      seed = 48615 # for reproducibility
)
The resulting object is an S4 object of class TSClusters, containing a lot of metadata along with the clustering information.
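One way to see everything the object carries is to list its slots:

# List the available slots of the S4 TSClusters object
slotNames(clustering)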
Let’s look at some of these components to further understand the resulting object:
@cluster: an integer vector indicating which cluster each series belongs to (crisp partition). Hence, this vector stores the resulting cluster membership for each of the series used in model training:
clustering@cluster[1:20]
## [1] 5 5 6 10 5 8 7 10 9 6 4 10 5 8 6 10 1 5 10 10
length(clustering@cluster)
## [1] 823
length(lot)
## [1] 823
@centroids: a list with the centroid time series. Similar to the barycenter centroids above, these are the time-series corresponding to the centroids of the k-shape clustering. Note that the centroid series are given as z-scores, since the k-shape algorithm works in z-score space.
clustering@centroids[[1]]
## [1] -1.31949219 -1.31439246 -1.27014945 -1.10159295 -1.07958517 -1.06835473
## [7] -0.89733292 -0.47711711 -0.45508149 -0.05626825 0.22826787 0.62390646
## [13] 0.47737021 1.03649080 1.78640993 2.06371477 2.29257335 1.72198857
## [19] 1.79801481 1.11347199 1.36171854 0.41920867 0.58510671 0.78077502
## [25] 0.28131587 0.20684917 0.09531864 -0.15662534 -0.23815985 -0.42599606
## [31] -0.29906702 -0.35444514 -0.43916636 -0.55407056 -0.67331040 -0.65489941
## [37] -0.77078459 -0.71979392 -0.81483471 -0.77100573 -0.96097559
# Centroids for k=10 clusters
length(clustering@centroids)
## [1] 10
@clusinfo: a data frame with two columns: size indicates the number of series in each cluster, and av_dist indicates, for each cluster, the average distance between the series and their respective centroid (crisp partition). This can be a useful output for exploring the optimal number of clusters when training the model.
print(clustering@clusinfo)
## size av_dist
## 1 56 0.2307909
## 2 15 0.3162775
## 3 74 0.2551236
## 4 55 0.2984268
## 5 149 0.1518849
## 6 177 0.1877729
## 7 68 0.2099054
## 8 83 0.2110363
## 9 76 0.3467991
## 10 70 0.2987365
@cldist: a column vector with the distance between each series in the data and its corresponding centroid (crisp partition). This can be another useful output for exploring the optimal number of clusters, when used along with the cluster membership metadata (i.e., @cluster); a sketch combining the two follows.
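For instance, per-cluster total distances can be computed as below (the same idea reappears in the model-selection experiment later):

# Combine @cldist with @cluster to get per-cluster distance summaries
data.frame(distance = clustering@cldist[, 1],
           cluster = clustering@cluster) %>%
  group_by(cluster) %>%
  summarize(total = sum(distance))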
head(clustering@cldist)
## [,1]
## [1,] 0.10974664
## [2,] 0.09120469
## [3,] 0.25328103
## [4,] 0.10719442
## [5,] 0.14069971
## [6,] 0.17389584
nrow(clustering@cldist)
## [1] 823
Let’s visualize k-shape clustering results along with the resulting centroids.
cluster_membership <- clustering@cluster

# Timeline of the plots (i.e., the x-axis)
timeline <- as.numeric(row.names(counts_uni))

cluster_series_with_centroids = list() # a spare list to hold the series in each cluster along with centroids
series_zscored <- data.frame(clustering@datalist) # z-scored series used for clustering

par(mfrow = c(5,2))

for (i in 1:length(unique(cluster_membership))){

  # Identify the columns that represent the series in the given cluster
  columns_selected <- colnames(series_zscored)[cluster_membership == i]

  # List of series in the given cluster
  temp <- series_zscored[, columns_selected]
  series_in_cluster <- c(temp)

  # Get the centroid for the given cluster
  cent <- clustering@centroids[[i]]

  # Plot each series in the cluster
  plot(x = timeline, y = rep(1.1, length(timeline)), type = "n", ylab = "signal", ylim = c(min(temp), max(temp)),
       main = paste0("Cluster ", i))
  for (s in 1:length(series_in_cluster)){
    lines(x = timeline, y = series_in_cluster[[s]], col = scales::alpha("grey", 0.5))
  }

  # Add the centroid
  lines(x = timeline, y = cent, col = "red", lwd = 2)

  series_in_cluster <- list(series_in_cluster, centroid = cent)
  cluster_series_with_centroids[[i]] <- series_in_cluster
}
Using k-shape clustering, we seem to get generally tighter clusters compared to those obtained with hierarchical clustering. Next, let's try to optimize k, the number of clusters, for the given data set.

Choosing the number of clusters is ultimately a trade-off between overfitting the training set and underfitting with too few clusters. If we choose too many clusters, we can get very tight clusters, but the resulting model may not generalize and would be harder to explain due to the large number of clusters. On the other hand, if we choose too few clusters, the clusters will be more heterogeneous, hence noisier and less likely to contain a distinctive pattern.
In order to facilitate the selection of k (the number of clusters), I will track two metrics for each clustering attempt:

- total_distance: the total distance between each series and its centroid, summed within each cluster and then averaged over clusters

- average_distance: the mean of the per-cluster average distances (av_dist) between the series and their centroids

I will also monitor the time it takes to train the model as another dimension for the comparison.
# Number of clusters to be tested in the experiment
K.values = c(2:15, 18, 20, 22, 25, 30, 40, 50)

results <- data.frame(K.values = K.values,
                      total_distance = 0,
                      average_distance = 0,
                      compute_time = 0,
                      stringsAsFactors = FALSE)

# Test each K value
for (i in 1:length(K.values)){

  K <- K.values[i]

  # Clustering model
  clustering <- tsclust(series = lot,
                        type = "partitional", # type of clustering method to use
                        k = K, # number of clusters
                        preproc = "zscore", # preprocessing function to use
                        distance = "sbd", # distance metric to use
                        centroid = "shape", # function to calculate centroids
                        seed = 48615 # for reproducibility
  )

  average_distance <- mean(clustering@clusinfo$av_dist)
  total_distance <- data.frame(distance = clustering@cldist[,1],
                               cluster = clustering@cluster) %>%
    group_by(cluster) %>%
    summarize(total = sum(distance))
  total_distance <- mean(total_distance$total)
  compute_time <- clustering@proctime[1]

  results$total_distance[i] <- total_distance
  results$average_distance[i] <- average_distance
  results$compute_time[i] <- compute_time
}
Next, let's visualize all three dimensions we collected in the experiment:
results_summary <- melt(results, id.vars = "K.values")

ggplot(results_summary, aes(x = K.values, y = value)) +
  geom_line(color = "navy") +
  geom_point(color = "navy") +
  facet_grid(variable ~ ., scales = "free")
It looks like k = 15 is a good trade-off for this data set. Both the total and the average distance decrease fairly steadily up to this point, and increasing model complexity beyond k = 15 yields only marginal reductions in these metrics. At the same time, compute time is relatively similar across the range of k = 12 to k = 30.
Let’s retrain the model with the selected number of clusters and visualize the resulting clusters:
lot <- c(counts_uni) # prepare a list of time-series corresponding to each sample

clustering <- tsclust(series = lot,
                      type = "partitional", # type of clustering method to use
                      k = 15, # number of clusters
                      preproc = "zscore", # preprocessing function to use
                      distance = "sbd", # distance metric to use
                      centroid = "shape", # function to calculate centroids
                      seed = 48615 # for reproducibility
)

cluster_membership <- clustering@cluster

# Timeline of the plots (i.e., the x-axis)
timeline <- as.numeric(row.names(counts_uni))

cluster_series_with_centroids = list() # a spare list to hold the series in each cluster along with centroids
series_zscored <- data.frame(clustering@datalist) # z-scored series used for clustering

par(mfrow = c(5,3))

for (i in 1:length(unique(cluster_membership))){

  # Identify the columns that represent the series in the given cluster
  columns_selected <- colnames(series_zscored)[cluster_membership == i]

  # List of series in the given cluster
  temp <- series_zscored[, columns_selected]
  series_in_cluster <- c(temp)

  # Get the centroid for the given cluster
  cent <- clustering@centroids[[i]]

  # Plot each series in the cluster
  plot(x = timeline, y = rep(1.1, length(timeline)), type = "n", ylab = "signal", ylim = c(min(temp), max(temp)),
       main = paste0("Cluster ", i))
  for (s in 1:length(series_in_cluster)){
    lines(x = timeline, y = series_in_cluster[[s]], col = scales::alpha("grey", 0.5))
  }

  # Add the centroid
  lines(x = timeline, y = cent, col = "red", lwd = 2)

  series_in_cluster <- list(series_in_cluster, centroid = cent)
  cluster_series_with_centroids[[i]] <- series_in_cluster
}
Many of the clusters we obtain look remarkably tighter relative to the hierarchical clustering.

Let's also visualize the z-scored series, ordered by these 15 clusters, in a heatmap:
# For each series in a given cluster, use its distance to the respective centroid,
# order based on this distance,
# select series from series_zscored in the resulting order, then cbind

series_matrix <- list()
series_distance_to_bc <- list()

for (i in 1:length(unique(cluster_membership))){

  temp <- series_zscored[cluster_membership == i]

  distvector <- clustering@cldist[,1][cluster_membership == i]
  names(distvector) <- colnames(temp)
  distvector <- distvector[order(distvector)]
  series_distance_to_bc[[i]] <- distvector

  series_matrix[[i]] <- temp[names(distvector)]
}

names(series_distance_to_bc) <- paste0("cluster_", 1:length(unique(cluster_membership)))
# Prepare the ordered series matrix based on clusters
series_matrix <- do.call(cbind, series_matrix)

# Only label rows once every 5 years to avoid clutter in the plot
mask <- as.numeric(row.names(counts_uni)) %% 5 != 0
labels_row <- ifelse(mask, "", row.names(counts_uni))
######################################################
# Add annotations for drugs to identify clusters
Cluster = NULL
Ing.names = NULL

for (k in 1:length(series_distance_to_bc)){
  Cluster <- c(Cluster, rep(names(series_distance_to_bc)[k], length(series_distance_to_bc[[k]])))
  Ing.names <- c(Ing.names, names(series_distance_to_bc[[k]]))
}

annotation_col = data.frame(
  Cluster = Cluster
)
row.names(annotation_col) <- Ing.names

x <- c(brewer.pal(n = 12, name = "Paired"), brewer.pal(n = 3, name = "Dark2"))
names(x) <- names(series_distance_to_bc)
annt_colors <- list(Cluster = x)
######################################################
pheatmap(series_matrix,
cluster_rows = F,
cluster_cols = F,
fontsize_col = 1, annotation_colors = annt_colors, labels_row = labels_row,
fontsize_row = 8,cellheight = 7, annotation_col = annotation_col,
color = colorRampPalette(rev(brewer.pal(n = 10, name ="RdBu")))(100))
Just as we explored for hierarchical clustering, let's also check how the drug approval signal aligns with the clusters we have:
########################################################
# Prepare Application summary metric time-series from OB
########################################################
names(cluster_membership) <- colnames(series_zscored)

ob_table <- readRDS("./input/ob_table.rds")

ob_table_summary <- ob_table %>%
  select(Ingredient, Appl_Type, Appl_No, Approval_Date) %>%
  distinct() %>%
  filter(Approval_Date != 'Approved Prior to Jan 1, 1982') %>%
  mutate(Approval_Date = mdy(Approval_Date)) %>%
  mutate(approval_year = year(Approval_Date)) %>%
  mutate(Ingredient = make.names(Ingredient)) %>%
  filter(Ingredient %in% names(cluster_membership)) %>%
  mutate(cluster_membership = as.numeric(cluster_membership[Ingredient])) %>%
  group_by(cluster_membership, approval_year) %>%
  summarise(total_num_approvals = n())

approval_table <- expand.grid(approval_year = 1980:2020,
                              cluster_membership = 1:15, stringsAsFactors = FALSE) %>%
  left_join(ob_table_summary) %>%
  mutate(total_num_approvals = ifelse(is.na(total_num_approvals), 0, total_num_approvals)) %>%
  dcast(approval_year ~ cluster_membership)

row.names(approval_table) <- approval_table$approval_year
approval_table <- approval_table %>% select(-approval_year)

# Convert approval signal to z-scores
approval_table <- apply(approval_table, 2, function(x){
  return( (x - mean(x)) / sd(x) )
})
par(mfrow = c(5,3))

for (i in 1:length(unique(cluster_membership))){

  # Identify the columns that represent the series in the given cluster
  columns_selected <- colnames(series_zscored)[cluster_membership == i]

  # List of series in the given cluster
  temp <- series_zscored[, columns_selected]
  series_in_cluster <- c(temp)

  # Get the centroid for the given cluster
  cent <- clustering@centroids[[i]]

  # Plot each series in the cluster
  plot(x = timeline, y = rep(1.1, length(timeline)), type = "n", ylab = "signal", ylim = c(min(temp), max(temp)),
       main = paste0("Cluster ", i))
  for (s in 1:length(series_in_cluster)){
    lines(x = timeline, y = series_in_cluster[[s]], col = scales::alpha("grey", 0.5))
  }

  # Add the centroid
  lines(x = timeline, y = cent, col = "red", lwd = 2)

  series_in_cluster <- list(series_in_cluster, centroid = cent)
  cluster_series_with_centroids[[i]] <- series_in_cluster

  # Add total approvals per year
  lines(x = timeline, y = approval_table[, i], col = "navy", lwd = 2)
  legend("topleft", legend = c("Approvals", "Cluster centroid"), lwd = 2,
         col = c("navy", "red"))
}
Again, I notice some degree of overlap, and perhaps a lagged pattern, between select cluster centroids and the total approval signal for the drug ingredients within a given cluster.