Dataset

You are given the flows of (direct and indirect) citations among (academic papers in) disciplines. The entry in position \((i, j)\) of the flow matrix is the absolute flow of citations from discipline \(j\) to discipline \(i\) (note the direction of the flow). The data consist of:

  1. The discipline names and the discipline sizes
  2. The absolute flows (one row per line)
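
In symbols, using the convention just stated and writing \(F\) for the flow matrix, the self-, in- and out-flows computed below are

\[
\mathrm{self}_i = F_{ii}, \qquad
\mathrm{in}_i = \sum_{j \neq i} F_{ij}, \qquad
\mathrm{out}_i = \sum_{j \neq i} F_{ji}.
\]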

Challenges

  1. Analyse the absolute flows for the different disciplines. Are they correlated with discipline sizes?
  2. Find the relative flows (normalize with respect to discipline size).
  3. Compare the disciplines mathematics and computer science. How do their flows differ?
  4. Compare observed flows to expected ones and find the flows with the largest positive and negative differences.
  5. Visualize the flow network among disciplines.
  6. Discover the most interdisciplinary disciplines as well as the most autarchical ones using heterogeneity measures and tools.
  7. Use hierarchical clustering to group disciplines into macro disciplines.
library(tidyverse)
library(igraph)
library(ggraph)
library(corrplot)

Read dataset

# discipline names
disc = scan("disciplines.txt", what = character(0), sep="\n")
# number of disciplines
ndisc = length(disc)
# size of disciplines
size_disc = scan("size.txt", what = numeric(0), sep="\n")
# flow matrix
# Note about scan: a field is always delimited by an end-of-line marker unless it is quoted,
# so with sep = "," each line of flows.txt (one matrix row) is read correctly
F = matrix(scan("flows.txt", what = numeric(0), sep=","), nrow = ndisc, ncol = ndisc, byrow = TRUE)
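
Before going further, a quick consistency check (a defensive addition, not part of the original script) verifies that the three files agree on the number of disciplines:

# sanity check: discipline names, sizes and flow matrix must have matching dimensions
stopifnot(length(size_disc) == ndisc,
          nrow(F) == ndisc,
          ncol(F) == ndisc)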

Absolute flows

# self flow: citations of the discipline to itself
self_flow = diag(F)
names(self_flow) = disc

# in flow: citations of other disciplines to the discipline
in_flow = rowSums(F) - self_flow
names(in_flow) = disc

# out flow: citations of the discipline to other disciplines
out_flow = colSums(F) - self_flow
names(out_flow) = disc

flows = tibble(discipline = disc, size = size_disc, self_flow = self_flow, in_flow = in_flow, out_flow = out_flow) %>% 
  mutate(id = row_number()) %>%
  select(id, everything())

As expected, discipline size and the absolute flows are strongly correlated:

# size and flows have skewed distributions
ggplot(flows) +
  geom_freqpoly(aes(x = size), bins = 30)

ggplot(flows) +
  geom_freqpoly(aes(x = in_flow), bins = 30)

# select and log-transform flows
flows_log = select(flows, size:out_flow) %>% 
  mutate(size = log2(size), self_flow = log2(self_flow), in_flow = log2(in_flow), out_flow = log2(out_flow))

cor(flows_log)
##                size self_flow   in_flow  out_flow
## size      1.0000000 0.9788099 0.9543103 0.9551879
## self_flow 0.9788099 1.0000000 0.9232361 0.8879798
## in_flow   0.9543103 0.9232361 1.0000000 0.9600901
## out_flow  0.9551879 0.8879798 0.9600901 1.0000000
corrplot.mixed(cor(flows_log), upper = "ellipse")

ggplot(flows_log, aes(x = size, y = in_flow)) +
  geom_point() + 
  geom_smooth()
## `geom_smooth()` using method = 'loess'

Relative flows

# normalize by discipline size
flows = flows %>% 
  mutate(self_rel_flow = self_flow / size, in_rel_flow = in_flow / size, out_rel_flow = out_flow / size)

# top-5 disciplines for relative in-flow
arrange(flows, desc(in_rel_flow)) %>% head(5)
# top-5 disciplines for relative out-flow
arrange(flows, desc(out_rel_flow)) %>% head(5)
# top-5 disciplines for relative self-flow
arrange(flows, desc(self_rel_flow)) %>% head(5)
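
To double-check that the normalization removes the size effect, one can also plot relative in-flow against size on a log scale (a quick visual check added here, not part of the original tasks); the trend should be much flatter than for the absolute flows:

# relative in-flow versus size: the strong size trend should largely disappear
ggplot(flows, aes(x = size, y = in_rel_flow)) +
  geom_point() +
  scale_x_log10() +
  geom_smooth()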

Compare flows for disciplines mathematics and computer science

discipline = "MATHEMATICS"
id = match(discipline, disc)
math_in = F[id, ] / sum(F[id, ])
names(math_in) = disc
math_out = F[ ,id] / sum(F[ ,id])
names(math_out) = disc

discipline = "COMPUTER SCIENCES"
id = match(discipline, disc)
cs_in = F[id, ] / sum(F[id, ])
names(cs_in) = disc
cs_out = F[ ,id] / sum(F[ ,id])
names(cs_out) = disc


# top-5 sources for mathematics
sort(math_in, decreasing = TRUE) %>% head(5)
##                                  MATHEMATICS 
##                                   0.71252273 
##                PHYSICS AND MATERIALS SCIENCE 
##                                   0.06684742 
##                            COMPUTER SCIENCES 
##                                   0.05625185 
##                         STATISTICAL SCIENCES 
##                                   0.02785838 
## ELECTRICAL ENGINEERING AND TELECOMMUNICATION 
##                                   0.02769001
# top-5 sources for computer science
sort(cs_in, decreasing = TRUE) %>% head(5)
##                            COMPUTER SCIENCES 
##                                   0.55081569 
## ELECTRICAL ENGINEERING AND TELECOMMUNICATION 
##                                   0.12708097 
##                                  MATHEMATICS 
##                                   0.05671132 
##                PHYSICS AND MATERIALS SCIENCE 
##                                   0.05087917 
##                         STATISTICAL SCIENCES 
##                                   0.02252808
# top-5 destinations for mathematics
sort(math_out, decreasing = TRUE) %>% head(5)
##                                  MATHEMATICS 
##                                   0.70776829 
##                PHYSICS AND MATERIALS SCIENCE 
##                                   0.08492858 
##                            COMPUTER SCIENCES 
##                                   0.04423225 
##                         STATISTICAL SCIENCES 
##                                   0.03136009 
## ELECTRICAL ENGINEERING AND TELECOMMUNICATION 
##                                   0.02612007
# top-5 destinations for computer science
sort(cs_out, decreasing = TRUE) %>% head(5)
##                            COMPUTER SCIENCES 
##                                   0.48999621 
## ELECTRICAL ENGINEERING AND TELECOMMUNICATION 
##                                   0.13805169 
##                                  MATHEMATICS 
##                                   0.06373035 
##                PHYSICS AND MATERIALS SCIENCE 
##                                   0.04636561 
##                         STATISTICAL SCIENCES 
##                                   0.03810150
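
To contrast the two profiles more directly (a small addition to the script), one can collect the four normalized vectors in a tibble and look at their pairwise correlations; high values indicate that the two disciplines distribute their citations over similar neighbourhoods even if the top partners differ:

# pairwise correlations between the citation profiles of mathematics and computer science
profiles = tibble(math_in, math_out, cs_in, cs_out)
cor(profiles)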

Compare observed flows to expected ones
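
Under independence of citing and cited disciplines, the expected flow between \(i\) and \(j\) is the product of the corresponding row and column totals of \(F\) divided by the grand total, exactly as in a chi-squared contingency test. The code below then measures deviations with signed Pearson residuals (X-test) and with likelihood-ratio contributions (G-test):

\[
E_{ij} = \frac{r_i \, c_j}{n}, \qquad
X_{ij} = \frac{F_{ij} - E_{ij}}{\sqrt{E_{ij}}}, \qquad
G_{ij} = F_{ij} \log\frac{F_{ij}}{E_{ij}},
\]

where \(r_i = \sum_j F_{ij}\), \(c_j = \sum_i F_{ij}\) and \(n = \sum_{i,j} F_{ij}\).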

# normalize by expected flows 
R = diag(rowSums(F))
C = diag(colSums(F))
n = sum(diag(C))
O = matrix(1, nrow = ndisc, ncol = ndisc)
# expected flows
E = (R %*% O %*% C) / n
# normalized flows (X-test): signed Pearson residuals
# (stored in X to avoid clobbering the column-sum matrix C above)
X = (F - E) / sqrt(E)

# normalized flows (G-test): likelihood-ratio contributions
# (a zero flow would give NaN here, since 0 * log(0) is NaN in R)
D = F * log(F / E)


# tidy flow matrix
# Note: the weight of each undirected edge is the sum of the weights of the two directed edges (mode = "plus")
g = graph_from_adjacency_matrix(X, mode = "plus", weighted = TRUE)
V(g)$name = 1:vcount(g)

chi = as_tibble(as_data_frame(g, what = "edges")) %>% 
  mutate(from = as.integer(from), to = as.integer(to)) %>%
  arrange(desc(weight))

chi = chi %>% 
  left_join(flows, by = c("from" = "id")) %>%
  left_join(flows, by = c("to" = "id")) %>%
  select(from, to, discipline.from = discipline.x, discipline.to = discipline.y, size.from = size.x, size.to = size.y, weight) %>%
  arrange(desc(weight))


# top-5 same discipline pairs
filter(chi, from == to) %>% head(5)
# bottom-5 same discipline pairs
filter(chi, from == to) %>% 
  arrange(weight) %>%
  head(5)
# top-10 different discipline pairs
filter(chi, from != to) %>% head(10)
# bottom-10 different discipline pairs
filter(chi, from != to) %>% 
  arrange(weight) %>%
  head(10)
# tidy flow matrix
# Note: the weight of each undirected edge is the sum of the weights of the two directed edges (mode = "plus")
g2 = graph_from_adjacency_matrix(D, mode = "plus", weighted = TRUE)
V(g2)$name = 1:vcount(g2)

gi = as_tibble(as_data_frame(g2, what = "edges")) %>% 
  mutate(from = as.integer(from), to = as.integer(to)) %>%
  arrange(desc(weight))

gi = gi %>% 
  left_join(flows, by = c("from" = "id")) %>%
  left_join(flows, by = c("to" = "id")) %>%
  select(from, to, discipline.from = discipline.x, discipline.to = discipline.y, size.from = size.x, size.to = size.y, weight) %>%
  arrange(desc(weight))

# top-5 same discipline pairs
filter(gi, from == to) %>% head(5)
# bottom-5 same discipline pairs
filter(gi, from == to) %>% 
  arrange(weight) %>%
  head(5)
# top-10 different discipline pairs
filter(gi, from != to) %>% head(10)
# bottom-10 different discipline pairs
filter(gi, from != to) %>% 
  arrange(weight) %>%
  head(10)

Visualize the flow network

set_graph_style()


# artwork
ggraph(g, layout = "circle") + 
  geom_edge_link() + 
  geom_node_point() 
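
In the plots that follow, nodes are labelled with the integer ids assigned above rather than with discipline names, so a small id-to-name legend (added here for convenience) helps with reading them:

# legend mapping node ids to discipline names
tibble(id = 1:ndisc, discipline = disc) %>% print(n = Inf)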

# positive weights: more than expected
ggraph(g, layout = "circle") + 
  geom_edge_link(aes(edge_alpha = weight, filter = (weight > 0))) + 
  geom_node_point() +
  geom_node_text(aes(label = name, x = x * 1.05, y = y * 1.05))

# negative weights: less than expected
ggraph(g, layout = "circle") + 
  geom_edge_link(aes(edge_alpha = -weight, filter = (weight < 0))) + 
  geom_node_point() +
  geom_node_text(aes(label = name, x = x * 1.05, y = y * 1.05))

# X-test
# more than 90th percentile
q90 = quantile(E(g)$weight, 0.90)
ggraph(g, layout = "circle") + 
  geom_edge_link(aes(edge_alpha = weight, edge_width = weight, filter = (weight > q90))) + 
  geom_node_point() +
  geom_node_text(aes(label = name, x = x * 1.05, y = y * 1.05))

# G-test
# more than 90th percentile
q90 = quantile(E(g2)$weight, 0.90)
ggraph(g2, layout = "circle") + 
  geom_edge_link(aes(edge_alpha = weight, edge_width = weight, filter = (weight > q90))) + 
  geom_node_point() +
  geom_node_text(aes(label = name, x = x * 1.05, y = y * 1.05))

# X-test
# less than 10th percentile
q10 = quantile(E(g)$weight, 0.10)
ggraph(g, layout = "circle") + 
  geom_edge_link(aes(edge_alpha = -weight, edge_width = -weight, filter = (weight < q10))) + 
  geom_node_point() +
  geom_node_text(aes(label = name, x = x * 1.05, y = y * 1.05))

# G-test
# less than 10th percentile
q10 = quantile(E(g2)$weight, 0.10)
ggraph(g2, layout = "circle") + 
  geom_edge_link(aes(edge_alpha = -weight, edge_width = -weight, filter = (weight < q10))) + 
  geom_node_point() +
  geom_node_text(aes(label = name, x = x * 1.05, y = y * 1.05))

Discover the most interdisciplinary disciplines as well as the most autarchical ones

# similarity between disciplines: cosine or Pearson correlation of the columns
# (mode = "col") or rows (mode = "row") of the weighted adjacency matrix
similarity = function(g, type = "cosine", mode = "col") {
  A = as_adjacency_matrix(g, attr = "weight", sparse = FALSE)
  if (mode == "row") {A = t(A)}
  if (type == "cosine") {
    euclidean = function(x) {sqrt(x %*% x)}
    d = apply(A, 2, euclidean)
    D = diag(1/d)
    S = D %*% t(A) %*% A %*% D
  }
  if (type == "pearson") {
    S = cor(A)
  }
  return(S)
}
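
For reference, with the default mode = "col" the cosine similarity returned by the function above for disciplines \(i\) and \(j\) is computed on the corresponding columns of the weighted adjacency matrix \(A\):

\[
S_{ij} = \frac{\sum_k A_{ki} A_{kj}}{\sqrt{\sum_k A_{ki}^2}\,\sqrt{\sum_k A_{kj}^2}},
\]

so two disciplines are similar when the corresponding columns (or rows, with mode = "row") of the flow matrix point in roughly the same direction. Later on, \(D = 1 - S\) is used as the corresponding dissimilarity.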

# Shannon entropy of a probability vector (0 * log2(0) counted as 0)
shannon = function(p) {
  x = p * log2(p)
  x = replace(x, is.nan(x), 0)
  return(-sum(x))
}

# Simpson (Gini-Simpson) heterogeneity index
simpson = function(p) {
  x = 1 - sum(p * p)
  return(x)
}

# Rao quadratic entropy of a probability vector given a dissimilarity matrix D
rao = function(p, D) {
  x = diag(p) %*% D %*% diag(p)
  return(sum(c(x)))
}
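
As a quick sanity check on these indices (a toy example, not part of the dataset), a uniform distribution maximizes Shannon and Simpson heterogeneity while a fully concentrated one drives both to zero:

# toy distributions: uniform (maximally heterogeneous) vs fully concentrated (autarchic)
p_uniform = rep(1/4, 4)
p_concentrated = c(1, 0, 0, 0)
shannon(p_uniform)       # log2(4) = 2
shannon(p_concentrated)  # 0
simpson(p_uniform)       # 1 - 4 * (1/4)^2 = 0.75
simpson(p_concentrated)  # 0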


# Shannon, Simpson and Rao heterogeneity of each discipline's column-normalized
# (mode = "col") or row-normalized (mode = "row") citation profile
heterogeneity = function(g, D, mode = "col") {
  A = as_adjacency_matrix(g, attr = "weight", sparse = FALSE)
  if (mode == "col") {
    A = A %*% diag(1/colSums(A))
    dim = 2 
  } else {
    A = diag(1/rowSums(A)) %*% A
    dim = 1 
  }
  return(list(shannon = apply(A, dim, shannon), 
              simpson = apply(A, dim, simpson), 
              rao = apply(A, dim, rao, D)))
}



# directed graph of the original absolute flows
f = graph_from_adjacency_matrix(F, mode = "directed", weighted = TRUE)
V(f)$name = 1:vcount(f)

# cosine similarity between disciplines and the corresponding dissimilarity
S = similarity(f)
D = 1 - S
het = heterogeneity(f, D)

flows = mutate(flows, shannon_id = het$shannon, simpson_id = het$simpson, rao_id = het$rao)

flows_id = select(flows, contains("_id"))
corrplot.mixed(cor(flows_id), upper = "ellipse")

# top-5 interdisciplinary disciplines (Shannon)
arrange(flows, desc(shannon_id)) %>% 
  select(discipline, contains("_id")) %>%
  head(5)
# top-5 interdisciplinary disciplines (Simpson)
arrange(flows, desc(simpson_id)) %>% 
  select(discipline, contains("_id")) %>%
  head(5)
# top-5 interdisciplinary disciplines (Rao)
arrange(flows, desc(rao_id)) %>% 
  select(discipline, contains("_id")) %>%
  head(5)
# top-5 autarchical disciplines (Shannon)
arrange(flows, shannon_id) %>% 
  select(discipline, contains("_id")) %>%
  head(5)
# top-5 autarchical disciplines (Simpson)
arrange(flows, simpson_id) %>% 
  select(discipline, contains("_id")) %>%
  head(5)
# top-5 autarchical disciplines (Rao)
arrange(flows, rao_id) %>% 
  select(discipline, contains("_id")) %>%
  head(5)

Use hierarchical clustering to group disciplines into macro disciplines

# distance object
d = as.dist(D)

# average-linkage clustering method
hc = hclust(d, method = "average")

# plot dendrogram
plot(hc)

set_graph_style()
ggraph(as.dendrogram(hc), layout = 'dendrogram', circular = TRUE) + 
  geom_edge_diagonal() +
  geom_node_text(aes(filter = leaf, label = label, x = x*1.03, y=y*1.03), size = 3)

# add discipline names as labels and redo the clustering for a labelled dendrogram
colnames(D) = disc
rownames(D) = disc
d = as.dist(D)
hc2 = hclust(d, method = "average")

set_graph_style()
ggraph(as.dendrogram(hc2), layout = 'dendrogram', circular = TRUE) + 
  geom_edge_diagonal() +
  geom_node_text(aes(filter = leaf, label = label, x = x*1.03, y=y*1.03), size = 2)
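
The dendrogram suggests how disciplines group together, but to actually extract macro disciplines the tree has to be cut at some number of clusters. The value k = 5 below is an arbitrary illustration, not a claim about the right number of macro disciplines:

# cut the labelled dendrogram into k macro disciplines (k is an arbitrary choice here)
k = 5
macro = cutree(hc2, k = k)
tibble(discipline = disc, macro = macro) %>% 
  arrange(macro, discipline)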