You are given the flows of (direct and indirect) citations among (academic papers in) disciplines. The entry in position \((i, j)\) is the absolute flow of citations from discipline \(j\) to discipline \(i\): columns correspond to the citing (source) disciplines and rows to the cited (target) disciplines (note the direction of the flow).
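As a quick orientation check, consider a hypothetical \(2 \times 2\) flow matrix (the disciplines and numbers below are made up):
# toy example (made-up data): B cites A 10 times, A cites B 3 times
toy = matrix(c(5, 10,
               3, 7),
             nrow = 2, byrow = TRUE,
             dimnames = list(c("A", "B"), c("A", "B")))
toy["A", "B"] # 10: flow from B (the column) to A (the row)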
library(tidyverse)
library(igraph)
library(ggraph)
library(corrplot)
# discipline names
disc = scan("disciplines.txt", what = character(0), sep="\n")
# number of disciplines
ndisc = length(disc)
# size of disciplines
size_disc = scan("size.txt", what = numeric(0), sep="\n")
# flow matrix
# Note about scan: a field is always delimited by an end-of-line marker unless it is quoted.
F = matrix(scan("flows.txt", what = numeric(0), sep=","), nrow = ndisc, ncol = ndisc, byrow = TRUE)
# self flow: citations of the discipline to itself
self_flow = diag(F)
names(self_flow) = disc
# in flow: citations of other disciplines to the discipline
in_flow = rowSums(F) - self_flow
names(in_flow) = disc
# out flow: citations of the discipline to other disciplines
out_flow = colSums(F) - self_flow
names(out_flow) = disc
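Since every cross-discipline citation leaves one discipline and enters another, the total in-flow must equal the total out-flow (both equal the grand total minus the self flows); a quick sanity check, added here for illustration:
# sanity check: total in-flow and total out-flow coincide
stopifnot(isTRUE(all.equal(sum(in_flow), sum(out_flow))))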
flows = tibble(discipline = disc, size = size_disc, self_flow = self_flow, in_flow = in_flow, out_flow = out_flow) %>%
mutate(id = row_number()) %>%
select(id, everything())
As expected, discipline size and the absolute flows are strongly correlated:
# size and flows have skewed distributions
ggplot(flows) +
geom_freqpoly(aes(x = size), bins = 30)
ggplot(flows) +
geom_freqpoly(aes(x = in_flow), bins = 30)
# select and log-transform flows
flows_log = select(flows, size:out_flow) %>%
mutate(size = log2(size), self_flow = log2(self_flow), in_flow = log2(in_flow), out_flow = log2(out_flow))
cor(flows_log)
## size self_flow in_flow out_flow
## size 1.0000000 0.9788099 0.9543103 0.9551879
## self_flow 0.9788099 1.0000000 0.9232361 0.8879798
## in_flow 0.9543103 0.9232361 1.0000000 0.9600901
## out_flow 0.9551879 0.8879798 0.9600901 1.0000000
corrplot.mixed(cor(flows_log), upper = "ellipse")
ggplot(flows_log, aes(x = size, y = in_flow)) +
geom_point() +
geom_smooth()
## `geom_smooth()` using method = 'loess'
# normalize by discipline size
flows = flows %>%
mutate(self_rel_flow = self_flow / size, in_rel_flow = in_flow / size, out_rel_flow = out_flow / size)
# top-5 disciplines for relative in-flow
arrange(flows, desc(in_rel_flow)) %>% head(5)
# top-5 disciplines for relative out-flow
arrange(flows, desc(out_rel_flow)) %>% head(5)
# top-5 disciplines for relative self-flow
arrange(flows, desc(self_rel_flow)) %>% head(5)
discipline = "MATHEMATICS"
id = match(discipline, disc)
math_in = F[id, ] / sum(F[id, ])
names(math_in) = disc
math_out = F[ ,id] / sum(F[ ,id])
names(math_out) = disc
discipline = "COMPUTER SCIENCES"
id = match(discipline, disc)
cs_in = F[id, ] / sum(F[id, ])
names(cs_in) = disc
cs_out = F[ ,id] / sum(F[ ,id])
names(cs_out) = disc
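The two blocks above repeat the same computation; a small helper (a sketch, not part of the original analysis) would make it reusable for any discipline:
# hypothetical helper: normalized source ("in") or destination ("out") profile
flow_profile = function(discipline, what = "in") {
  id = match(discipline, disc)
  p = if (what == "in") F[id, ] / sum(F[id, ]) else F[ , id] / sum(F[ , id])
  names(p) = disc
  p
}
# e.g. sort(flow_profile("MATHEMATICS", "out"), decreasing = TRUE) %>% head(5)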
# top-5 sources for mathematics
sort(math_in, decreasing = TRUE) %>% head(5)
## MATHEMATICS
## 0.71252273
## PHYSICS AND MATERIALS SCIENCE
## 0.06684742
## COMPUTER SCIENCES
## 0.05625185
## STATISTICAL SCIENCES
## 0.02785838
## ELECTRICAL ENGINEERING AND TELECOMMUNICATION
## 0.02769001
# top-5 sources for computer science
sort(cs_in, decreasing = TRUE) %>% head(5)
## COMPUTER SCIENCES
## 0.55081569
## ELECTRICAL ENGINEERING AND TELECOMMUNICATION
## 0.12708097
## MATHEMATICS
## 0.05671132
## PHYSICS AND MATERIALS SCIENCE
## 0.05087917
## STATISTICAL SCIENCES
## 0.02252808
# top-5 destinations for mathematics
sort(math_out, decreasing = TRUE) %>% head(5)
## MATHEMATICS
## 0.70776829
## PHYSICS AND MATERIALS SCIENCE
## 0.08492858
## COMPUTER SCIENCES
## 0.04423225
## STATISTICAL SCIENCES
## 0.03136009
## ELECTRICAL ENGINEERING AND TELECOMMUNICATION
## 0.02612007
# top-5 destinations for computer science
sort(cs_out, decreasing = TRUE) %>% head(5)
## COMPUTER SCIENCES
## 0.48999621
## ELECTRICAL ENGINEERING AND TELECOMMUNICATION
## 0.13805169
## MATHEMATICS
## 0.06373035
## PHYSICS AND MATERIALS SCIENCE
## 0.04636561
## STATISTICAL SCIENCES
## 0.03810150
# normalize by expected flows
R = diag(rowSums(F)) # diagonal matrix of row totals (citations received)
C = diag(colSums(F)) # diagonal matrix of column totals (citations made)
n = sum(diag(C)) # total citation flow
O = matrix(1, nrow = ndisc, ncol = ndisc) # all-ones matrix
# expected flows
E = (R %*% O %*% C) / n
# normalized flows (X-test): Pearson residuals
X = (F - E) / sqrt(E)
# normalized flows (G-test)
D = F * log(F / E)
# zero flows would give 0 * log(0) = NaN; take them as 0
D = replace(D, is.nan(D), 0)
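In formulas, with \(r_i\) the \(i\)-th row total, \(c_j\) the \(j\)-th column total, and \(n\) the grand total, the three matrices computed above are

\[
E_{ij} = \frac{r_i \, c_j}{n}, \qquad
X_{ij} = \frac{F_{ij} - E_{ij}}{\sqrt{E_{ij}}}, \qquad
D_{ij} = F_{ij} \log \frac{F_{ij}}{E_{ij}},
\]

so that \(\sum_{ij} X_{ij}^2\) is the \(X^2\) (chi-squared) statistic and \(2 \sum_{ij} D_{ij}\) is the \(G\) statistic.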
# tidy flow matrix
# Note: with mode = "plus", the weight of an undirected edge is the sum of the weights of the two opposite directed edges
g = graph_from_adjacency_matrix(X, mode = "plus", weighted = TRUE)
V(g)$name = 1:vcount(g)
chi = as_tibble(as_data_frame(g, what = "edges")) %>%
mutate(from = as.integer(from), to = as.integer(to)) %>%
arrange(desc(weight))
chi = chi %>%
left_join(flows, by = c("from" = "id")) %>%
left_join(flows, by = c("to" = "id")) %>%
select(from, to, discipline.from = discipline.x, discipline.to = discipline.y, size.from = size.x, size.to = size.y, weight) %>%
arrange(desc(weight))
# top-5 same discipline pairs
filter(chi, from == to) %>% head(5)
# bottom-5 same discipline pairs
filter(chi, from == to) %>%
arrange(weight) %>%
head(5)
# top-10 different discipline pairs
filter(chi, from != to) %>% head(10)
# bottom-10 different discipline pairs
filter(chi, from != to) %>%
arrange(weight) %>%
head(10)
# tidy flow matrix
# Note: with mode = "plus", the weight of an undirected edge is the sum of the weights of the two opposite directed edges
g2 = graph_from_adjacency_matrix(D, mode = "plus", weighted = TRUE)
V(g2)$name = 1:vcount(g2)
gi = as_tibble(as_data_frame(g2, what = "edges")) %>%
mutate(from = as.integer(from), to = as.integer(to)) %>%
arrange(desc(weight))
gi = gi %>%
left_join(flows, by = c("from" = "id")) %>%
left_join(flows, by = c("to" = "id")) %>%
select(from, to, discipline.from = discipline.x, discipline.to = discipline.y, size.from = size.x, size.to = size.y, weight) %>%
arrange(desc(weight))
# top-5 same discipline pairs
filter(gi, from == to) %>% head(5)
# bottom-5 same discipline pairs
filter(gi, from == to) %>%
arrange(weight) %>%
head(5)
# top-10 different discipline pairs
filter(gi, from != to) %>% head(10)
# bottom-10 different discipline pairs
filter(gi, from != to) %>%
arrange(weight) %>%
head(10)
set_graph_style()
# artwork
ggraph(g, layout = "circle") +
geom_edge_link() +
geom_node_point()
# positive weights: more than expected
ggraph(g, layout = "circle") +
geom_edge_link(aes(edge_alpha = weight, filter = (weight > 0))) +
geom_node_point() +
geom_node_text(aes(label = name, x = x * 1.05, y = y * 1.05))
# negative weights: less than expected
ggraph(g, layout = "circle") +
geom_edge_link(aes(edge_alpha = -weight, filter = (weight < 0))) +
geom_node_point() +
geom_node_text(aes(label = name, x = x * 1.05, y = y * 1.05))
# X-test
# more than 90th percentile
q90 = quantile(E(g)$weight, 0.90)
ggraph(g, layout = "circle") +
geom_edge_link(aes(edge_alpha = weight, edge_width = weight, filter = (weight > q90))) +
geom_node_point() +
geom_node_text(aes(label = name, x = x * 1.05, y = y * 1.05))
# G-test
# more than 90th percentile
q90 = quantile(E(g2)$weight, 0.90)
ggraph(g2, layout = "circle") +
geom_edge_link(aes(edge_alpha = weight, edge_width = weight, filter = (weight > q90))) +
geom_node_point() +
geom_node_text(aes(label = name, x = x * 1.05, y = y * 1.05))
# X-test
# less than 10th percentile
q10 = quantile(E(g)$weight, 0.10)
ggraph(g, layout = "circle") +
geom_edge_link(aes(edge_alpha = -weight, edge_width = -weight, filter = (weight < q10))) +
geom_node_point() +
geom_node_text(aes(label = name, x = x * 1.05, y = y * 1.05))
# G-test
# less than 10th percentile
q10 = quantile(E(g2)$weight, 0.10)
ggraph(g2, layout = "circle") +
geom_edge_link(aes(edge_alpha = -weight, edge_width = -weight, filter = (weight < q10))) +
geom_node_point() +
geom_node_text(aes(label = name, x = x * 1.05, y = y * 1.05))
# similarity between disciplines, computed on the weighted adjacency matrix
# type: "cosine" or "pearson"; mode: "col" compares columns (outgoing profiles),
# "row" compares rows (incoming profiles)
similarity = function(g, type = "cosine", mode = "col") {
  A = as_adjacency_matrix(g, attr = "weight", sparse = FALSE)
  if (mode == "row") {A = t(A)}
  if (type == "cosine") {
    # cosine similarity: dot products of columns scaled to unit Euclidean norm
    euclidean = function(x) {sqrt(x %*% x)}
    d = apply(A, 2, euclidean)
    D = diag(1/d)
    S = D %*% t(A) %*% A %*% D
  }
  if (type == "pearson") {
    # Pearson correlation between columns
    S = cor(A)
  }
  return(S)
}
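A quick way to sanity-check the cosine branch is to compare one entry against a direct two-column computation on a toy weighted graph (made-up data):
# toy weighted directed graph
A_toy = matrix(c(0, 1, 2,
                 2, 0, 1,
                 1, 3, 0), nrow = 3, byrow = TRUE)
g_toy = graph_from_adjacency_matrix(A_toy, mode = "directed", weighted = TRUE)
S_toy = similarity(g_toy)
# cosine of columns 1 and 2, computed directly
a = A_toy[, 1]; b = A_toy[, 2]
all.equal(S_toy[1, 2], sum(a * b) / (sqrt(sum(a * a)) * sqrt(sum(b * b))))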
# Shannon entropy of a probability distribution (0 * log(0) taken as 0)
shannon = function(p) {
  x = p * log2(p)
  x = replace(x, is.nan(x), 0)
  return(-sum(x))
}
# Simpson diversity: probability that two random draws fall in different categories
simpson = function(p) {
  x = 1 - sum(p * p)
  return(x)
}
# Rao quadratic entropy: expected dissimilarity D between two random draws
rao = function(p, D) {
  x = diag(p) %*% D %*% diag(p)
  return(sum(c(x)))
}
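On toy distributions these indices behave as expected: a uniform distribution maximizes diversity, a point mass minimizes it, and rao reduces to simpson when all pairwise dissimilarities equal 1 (a made-up illustration):
p_uniform = rep(1/4, 4)
p_peaked = c(0.97, 0.01, 0.01, 0.01)
shannon(p_uniform) # 2 bits, the maximum over 4 categories
shannon(p_peaked) # much smaller
simpson(p_uniform) # 0.75
D_toy = 1 - diag(4) # all categories at dissimilarity 1
rao(p_uniform, D_toy) # 0.75, equal to simpson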
# diversity of each discipline's citation distribution
# mode "col" uses the outgoing flows (columns), otherwise the incoming flows (rows)
heterogeneity = function(g, D, mode = "col") {
  A = as_adjacency_matrix(g, attr = "weight", sparse = FALSE)
  if (mode == "col") {
    # normalize columns to probability distributions
    A = A %*% diag(1/colSums(A))
    dim = 2
  } else {
    # normalize rows to probability distributions
    A = diag(1/rowSums(A)) %*% A
    dim = 1
  }
  return(list(shannon = apply(A, dim, shannon),
              simpson = apply(A, dim, simpson),
              rao = apply(A, dim, rao, D)))
}
f = graph_from_adjacency_matrix(F, mode = "directed", weighted = TRUE)
V(f)$name = 1:vcount(f)
S = similarity(f)
# dissimilarity between disciplines
D = 1 - S
het = heterogeneity(f, D)
flows = mutate(flows, shannon_id = het$shannon, simpson_id = het$simpson, rao_id = het$rao)
flows_id = select(flows, contains("_id"))
corrplot.mixed(cor(flows_id), upper = "ellipse")
# top-5 interdisciplinary disciplines (Shannon)
arrange(flows, desc(shannon_id)) %>%
select(discipline, contains("_id")) %>%
head(5)
# top-5 interdisciplinary disciplines (Simpson)
arrange(flows, desc(simpson_id)) %>%
select(discipline, contains("_id")) %>%
head(5)
# top-5 interdisciplinary disciplines (Rao)
arrange(flows, desc(rao_id)) %>%
select(discipline, contains("_id")) %>%
head(5)
# top-5 autarchy disciplines (Shannon)
arrange(flows, shannon_id) %>%
select(discipline, contains("_id")) %>%
head(5)
# top-5 autarchy disciplines (Simpson)
arrange(flows, simpson_id) %>%
select(discipline, contains("_id")) %>%
head(5)
# top-5 autarchy disciplines (Rao)
arrange(flows, rao_id) %>%
select(discipline, contains("_id")) %>%
head(5)
# distance object
d = as.dist(D)
# average-linkage clustering method
hc = hclust(d, method = "average")
# plot dendrogram
plot(hc)
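To obtain a flat partition from the tree, one could cut the dendrogram at a chosen number of groups (a sketch; the choice of 7 clusters is arbitrary):
# cut the average-linkage tree into groups and list the disciplines per cluster
groups = cutree(hc, k = 7)
split(disc, groups)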
set_graph_style()
ggraph(as.dendrogram(hc), layout = 'dendrogram', circular = TRUE) +
geom_edge_diagonal() +
geom_node_text(aes(filter = leaf, label = label, x = x*1.03, y=y*1.03), size = 3)
# label the dissimilarity matrix with discipline names for a readable dendrogram
colnames(D) = disc
rownames(D) = disc
d = as.dist(D)
hc2 = hclust(d, method = "average")
set_graph_style()
ggraph(as.dendrogram(hc2), layout = 'dendrogram', circular = TRUE) +
geom_edge_diagonal() +
geom_node_text(aes(filter = leaf, label = label, x = x*1.03, y=y*1.03), size = 2)