Interdisciplinarity and autarchy in science

You are given the flow matrix containing citations among academic papers in disciplines. Rows and colums maps to disciplines and an entry in position \(i\) and \(j\) of the matrix corresponds to the absolute flow of citations from discipline \(i\) to discipline \(j\). Source: Quantifying the higher-order influence of scientific publications.

Challanges

What are the most self-citing disciplines? And the most cited ones?
Compare observed flows and expected ones
Visualize the flow network
Discover the most interdisciplinary disciplines using Rao quadratic entropy

library(tidyverse)
library(igraph)
library(ggraph)

Read dataset

# discipline names
disc = scan("disciplines.txt", what = character(0), sep="\n")

# number of disciplines
ndisc = length(disc)

# size of disciplines (number of papers)
size_disc = scan("size.txt", what = numeric(0), sep="\n")

# flow matrix: entry in position (i,j) are citations from i to j
F = matrix(scan("flows.txt", what = numeric(0), sep=","), nrow = ndisc, ncol = ndisc)

Absolute flows

# self flow: citations of the discipline to itself
self_flow = diag(F)
names(self_flow) = disc

# out flow: citations of the discipline to other disciplines
out_flow = rowSums(F) - self_flow
names(out_flow) = disc

# in flow: citations of other disciplines to the discipline
in_flow = colSums(F) - self_flow
names(in_flow) = disc

flows = tibble(discipline = disc, size = size_disc, self_flow = self_flow, 
               in_flow = in_flow, out_flow = out_flow) %>% 
  mutate(id = row_number()) %>%
  select(id, everything())

flows

What are the most self-citing disciplines? And the most cited ones?

flows %>% 
  mutate (self = self_flow / size) %>% 
  arrange(desc(self)) %>% 
  select(discipline, self)

flows %>% 
  mutate (cited = in_flow / size) %>% 
  arrange(desc(cited)) %>% 
  select(discipline, cited)

Compare observed flows to expected ones

The expected flow from node \(i\) to node \(j\) is \[\frac{out_i \cdot in_j}{n}\] with \(in_j\) the inflow of \(j\), \(out_i\) the outflow of \(i\), and \(n\) the total flow. Indeed \(in_j / n\) is the probability of flowing to \(j\), and hence \(out_i \cdot in_j/n\) is the expected flow coming from \(i\) and directed to \(j\).

# normalize by expected flows 
R = diag(rowSums(F))
C = diag(colSums(F))
n = sum(c(F))
O = matrix(1, nrow = ndisc, ncol = ndisc)
# expected flow
E = (R %*% O %*% C) / n
# normalized flows (X-test)
C = (F - E) / sqrt(E) 
# normalized flows (G-test)
D = F * log(F / E)

Visualize the flow network

# tidy flow matrix: we consider undirected flow for each pair (i,j) by summing the directed flows i --> j and j --> i
g = graph_from_adjacency_matrix(C, mode = "plus", weighted = TRUE)
V(g)$name = 1:vcount(g)


# full graph
ggraph(g, layout = "circle") + 
  geom_edge_link() + 
  geom_node_point() +
  coord_fixed() +
  theme_graph()

# positive weights: more than expected
ggraph(g, layout = "circle") + 
  geom_edge_link(aes(edge_alpha = weight, filter = (weight > quantile(weight, 0.9)))) + 
  geom_node_point() +
  geom_node_text(aes(label = name, x = x * 1.05, y = y * 1.05))  +
  coord_fixed() +
  theme_graph()

# negative weights: less than expected
ggraph(g, layout = "circle") + 
  geom_edge_link(aes(edge_alpha = -weight, filter = (weight < quantile(weight, 0.1)))) + 
  geom_node_point() +
  geom_node_text(aes(label = name, x = x * 1.05, y = y * 1.05))  +
  coord_fixed() +
  theme_graph()

Discover the most interdisciplinary/autarchical disciplines

similarity = function(g, type = "cosine", mode = "col" ) {
  A = as_adjacency_matrix(g, attr = "weight", sparse = FALSE)
  if (mode == "row") {A = t(A)}
  if (type == "cosine") {
    euclidean = function(x) {sqrt(x %*% x)}
    d = apply(A, 2, euclidean)
    D = diag(1/d)
    S = D %*% t(A) %*% A %*% D
  }
  if (type == "pearson") {
    S = cor(A)
  }
  return(S)
}

shannon = function(p) {
  x = p * log2(p)
  x = replace(x, is.nan(x), 0)
  return(-sum(x))
}

simpson = function(p) {
  x = 1 - sum(p * p)
  return(x)
}

rao = function(p, D) {
  x = diag(p) %*% D %*% diag(p)
  return(sum(c(x)))
}


heterogeneity = function(g, D, mode = "col") {
  A = as_adjacency_matrix(g, attr = "weight", sparse = FALSE)
  if (mode == "col") {
    A = A %*% diag(1/colSums(A))
    dim = 2 
  } else {
    A = diag(1/rowSums(A)) %*% A
    dim = 1 
  }
  return(list(shannon = apply(A, dim, shannon), 
              simpson = apply(A, dim, simpson), 
              rao = apply(A, dim, rao, D)))
}



f = graph_from_adjacency_matrix(F, mode = "directed", weighted = TRUE)
V(f)$name = 1:vcount(f)

S = similarity(f)
D = 1 - S
het = heterogeneity(f, D)

flows = mutate(flows, rao = het$rao)

# top-5 interdisciplinary disciplines (Rao)
arrange(flows, desc(rao)) %>% 
  select(discipline, rao) %>%
  head(5)

# top-5 autarchy disciplines (Rao)
arrange(flows, rao) %>% 
  select(discipline, rao) %>%
  head(5)