You are given the flow matrix containing citations among academic papers in disciplines. Rows and colums maps to disciplines and an entry in position \(i\) and \(j\) of the matrix corresponds to the absolute flow of citations from discipline \(i\) to discipline \(j\). Source: Quantifying the higher-order influence of scientific publications.
library(tidyverse)
library(igraph)
library(ggraph)
# discipline names
disc = scan("disciplines.txt", what = character(0), sep="\n")
# number of disciplines
ndisc = length(disc)
# size of disciplines (number of papers)
size_disc = scan("size.txt", what = numeric(0), sep="\n")
# flow matrix: entry in position (i,j) are citations from i to j
F = matrix(scan("flows.txt", what = numeric(0), sep=","), nrow = ndisc, ncol = ndisc)
# self flow: citations of the discipline to itself
self_flow = diag(F)
names(self_flow) = disc
# out flow: citations of the discipline to other disciplines
out_flow = rowSums(F) - self_flow
names(out_flow) = disc
# in flow: citations of other disciplines to the discipline
in_flow = colSums(F) - self_flow
names(in_flow) = disc
flows = tibble(discipline = disc, size = size_disc, self_flow = self_flow,
in_flow = in_flow, out_flow = out_flow) %>%
mutate(id = row_number()) %>%
select(id, everything())
flows
flows %>%
mutate (self = self_flow / size) %>%
arrange(desc(self)) %>%
select(discipline, self)
flows %>%
mutate (cited = in_flow / size) %>%
arrange(desc(cited)) %>%
select(discipline, cited)
The expected flow from node \(i\) to node \(j\) is \[\frac{out_i \cdot in_j}{n}\] with \(in_j\) the inflow of \(j\), \(out_i\) the outflow of \(i\), and \(n\) the total flow. Indeed \(in_j / n\) is the probability of flowing to \(j\), and hence \(out_i \cdot in_j/n\) is the expected flow coming from \(i\) and directed to \(j\).
# normalize by expected flows
R = diag(rowSums(F))
C = diag(colSums(F))
n = sum(c(F))
O = matrix(1, nrow = ndisc, ncol = ndisc)
# expected flow
E = (R %*% O %*% C) / n
# normalized flows (X-test)
C = (F - E) / sqrt(E)
# normalized flows (G-test)
D = F * log(F / E)
# tidy flow matrix: we consider undirected flow for each pair (i,j) by summing the directed flows i --> j and j --> i
g = graph_from_adjacency_matrix(C, mode = "plus", weighted = TRUE)
V(g)$name = 1:vcount(g)
# full graph
ggraph(g, layout = "circle") +
geom_edge_link() +
geom_node_point() +
coord_fixed() +
theme_graph()
# positive weights: more than expected
ggraph(g, layout = "circle") +
geom_edge_link(aes(edge_alpha = weight, filter = (weight > quantile(weight, 0.9)))) +
geom_node_point() +
geom_node_text(aes(label = name, x = x * 1.05, y = y * 1.05)) +
coord_fixed() +
theme_graph()
# negative weights: less than expected
ggraph(g, layout = "circle") +
geom_edge_link(aes(edge_alpha = -weight, filter = (weight < quantile(weight, 0.1)))) +
geom_node_point() +
geom_node_text(aes(label = name, x = x * 1.05, y = y * 1.05)) +
coord_fixed() +
theme_graph()
similarity = function(g, type = "cosine", mode = "col" ) {
A = as_adjacency_matrix(g, attr = "weight", sparse = FALSE)
if (mode == "row") {A = t(A)}
if (type == "cosine") {
euclidean = function(x) {sqrt(x %*% x)}
d = apply(A, 2, euclidean)
D = diag(1/d)
S = D %*% t(A) %*% A %*% D
}
if (type == "pearson") {
S = cor(A)
}
return(S)
}
shannon = function(p) {
x = p * log2(p)
x = replace(x, is.nan(x), 0)
return(-sum(x))
}
simpson = function(p) {
x = 1 - sum(p * p)
return(x)
}
rao = function(p, D) {
x = diag(p) %*% D %*% diag(p)
return(sum(c(x)))
}
heterogeneity = function(g, D, mode = "col") {
A = as_adjacency_matrix(g, attr = "weight", sparse = FALSE)
if (mode == "col") {
A = A %*% diag(1/colSums(A))
dim = 2
} else {
A = diag(1/rowSums(A)) %*% A
dim = 1
}
return(list(shannon = apply(A, dim, shannon),
simpson = apply(A, dim, simpson),
rao = apply(A, dim, rao, D)))
}
f = graph_from_adjacency_matrix(F, mode = "directed", weighted = TRUE)
V(f)$name = 1:vcount(f)
S = similarity(f)
D = 1 - S
het = heterogeneity(f, D)
flows = mutate(flows, rao = het$rao)
# top-5 interdisciplinary disciplines (Rao)
arrange(flows, desc(rao)) %>%
select(discipline, rao) %>%
head(5)
# top-5 autarchy disciplines (Rao)
arrange(flows, rao) %>%
select(discipline, rao) %>%
head(5)