You are given the flow matrix containing citations among academic papers in disciplines. Rows and colums maps to disciplines and an entry in position \(i\) and \(j\) of the matrix corresponds to the absolute flow of citations from discipline \(i\) to discipline \(j\). Source: Quantifying the higher-order influence of scientific publications.

  1. The discipline names and the discipline sizes
  2. The absolute flows (one row per line)

Challanges

  1. compare observed flows to expected ones
  2. visualize the flow network
  3. discover the most interdisciplinary disciplines using Rao quadratic entropy
library(tidyverse)
library(igraph)
library(ggraph)

Read dataset

# discipline names
disc = scan("disciplines.txt", what = character(0), sep="\n")

# number of disciplines
ndisc = length(disc)

# size of disciplines (number of papers)
size_disc = scan("size.txt", what = numeric(0), sep="\n")

# flow matrix: entry in position (i,j) are citations from i to j
F = matrix(scan("flows.txt", what = numeric(0), sep=","), nrow = ndisc, ncol = ndisc)

Absolute flows

# self flow: citations of the discipline to itself
self_flow = diag(F)
names(self_flow) = disc

# out flow: citations of the discipline to other disciplines
out_flow = rowSums(F) - self_flow
names(out_flow) = disc

# in flow: citations of other disciplines to the discipline
in_flow = colSums(F) - self_flow
names(in_flow) = disc

flows = tibble(discipline = disc, size = size_disc, self_flow = self_flow, 
               in_flow = in_flow, out_flow = out_flow) %>% 
  mutate(id = row_number()) %>%
  select(id, everything())

flows

Compare observed flows to expected ones

The expected flow from node \(i\) to node \(j\) is \[\frac{out_i \cdot in_j}{n}\] with \(in_j\) the inflow of \(j\), \(out_i\) the outflow of \(i\), and \(n\) the total flow. Indeed \(in_j / n\) is the probability of flowing to \(j\), and hence \(out_i \cdot in_j/n\) is the expected flow coming from \(i\) and directed to \(j\).

# normalize by expected flows 
R = diag(rowSums(F))
C = diag(colSums(F))
n = sum(c(F))
O = matrix(1, nrow = ndisc, ncol = ndisc)
# expected flow
E = (R %*% O %*% C) / n
# normalized flows (X-test)
C = (F - E) / sqrt(E) 
# normalized flows (G-test)
D = F * log(F / E)

Visualize the flow network

# tidy flow matrix: we consider undirected flow for each pair (i,j) by summing the directed flows i --> j and j --> i
g = graph_from_adjacency_matrix(C, mode = "plus", weighted = TRUE)
V(g)$name = 1:vcount(g)


# full graph
ggraph(g, layout = "circle") + 
  geom_edge_link() + 
  geom_node_point() +
  coord_fixed() +
  theme_graph()

# positive weights: more than expected
ggraph(g, layout = "circle") + 
  geom_edge_link(aes(edge_alpha = weight, filter = (weight > quantile(weight, 0.9)))) + 
  geom_node_point() +
  geom_node_text(aes(label = name, x = x * 1.05, y = y * 1.05))  +
  coord_fixed() +
  theme_graph()

# negative weights: less than expected
ggraph(g, layout = "circle") + 
  geom_edge_link(aes(edge_alpha = -weight, filter = (weight < quantile(weight, 0.1)))) + 
  geom_node_point() +
  geom_node_text(aes(label = name, x = x * 1.05, y = y * 1.05))  +
  coord_fixed() +
  theme_graph()

Discover the most interdisciplinary/autarchical disciplines

similarity = function(g, type = "cosine", mode = "col" ) {
  A = as_adjacency_matrix(g, attr = "weight", sparse = FALSE)
  if (mode == "row") {A = t(A)}
  if (type == "cosine") {
    euclidean = function(x) {sqrt(x %*% x)}
    d = apply(A, 2, euclidean)
    D = diag(1/d)
    S = D %*% t(A) %*% A %*% D
  }
  if (type == "pearson") {
    S = cor(A)
  }
  return(S)
}

shannon = function(p) {
  x = p * log2(p)
  x = replace(x, is.nan(x), 0)
  return(-sum(x))
}

simpson = function(p) {
  x = 1 - sum(p * p)
  return(x)
}

rao = function(p, D) {
  x = diag(p) %*% D %*% diag(p)
  return(sum(c(x)))
}


heterogeneity = function(g, D, mode = "col") {
  A = as_adjacency_matrix(g, attr = "weight", sparse = FALSE)
  if (mode == "col") {
    A = A %*% diag(1/colSums(A))
    dim = 2 
  } else {
    A = diag(1/rowSums(A)) %*% A
    dim = 1 
  }
  return(list(shannon = apply(A, dim, shannon), 
              simpson = apply(A, dim, simpson), 
              rao = apply(A, dim, rao, D)))
}



f = graph_from_adjacency_matrix(F, mode = "directed", weighted = TRUE)
V(f)$name = 1:vcount(f)

S = similarity(f)
D = 1 - S
het = heterogeneity(f, D)

flows = mutate(flows, rao = het$rao)

# top-5 interdisciplinary disciplines (Rao)
arrange(flows, desc(rao)) %>% 
  select(discipline, rao) %>%
  head(5)
# top-5 autarchy disciplines (Rao)
arrange(flows, rao) %>% 
  select(discipline, rao) %>%
  head(5)