Jose A. Rodriguez of the University of Barcelona created a network of the individuals involved in the bombing of commuter trains in Madrid on March 11, 2004. Rodriguez used press accounts in the two major Spanish daily newspapers (El Pais and El Mundo) to reconstruct the terrorist network. The names included were of those people suspected of having participated and their relatives. Rodriguez specified 4 kinds of ties linking the individuals involved:

These four tie types were added together, providing a strength-of-connection index that ranges from 1 to 4.

Dataset

Data challenges

  1. Who are the most connected terrorists? And the isolated ones?
  2. Who are the terrorists with the highest connection strength?
  3. Devise a ggplot visualization of the connections and weights among terrorists (Hint: use a scatterplot). Is the devised visualization useful to identify highly connected groups of terrorists? Is it useful to spot pairs of terrorists having similar connection patterns?
  4. Use the similarity among pairs of terrorists to detect the most similar individuals.
  5. Use ggplot to visualize the similarity among the most similar pairs.
  6. Use clustering, with dissimilarity as the distance measure, to discover the terrorist cells.
library(tidyverse)
library(igraph)
library(ggraph)

Read dataset

# Load the two input tables: one row per person (names.csv) and one row per
# weighted tie between two people (edges.csv).
terrorists <- read_csv("names.csv")
## Parsed with column specification:
## cols(
##   name = col_character()
## )
ties <- read_csv("edges.csv")
## Parsed with column specification:
## cols(
##   from = col_integer(),
##   to = col_integer(),
##   weight = col_integer()
## )

# Number the people by row position and move the id to the first column.
# seq_len() is used instead of 1:nrow() so an empty table yields no ids
# rather than c(1, 0).
# NOTE(review): assumes the from/to values in edges.csv are row positions in
# names.csv -- confirm against the data.
terrorists <- terrorists %>%
  mutate(id = seq_len(nrow(terrorists))) %>%
  select(id, everything())

# make graph
# Only the ids are supplied as vertex data, so every person becomes a vertex
# (including those without ties) and the vertex `name` attribute is the id,
# not the person's name.
#g = graph_from_data_frame(ties, directed = FALSE, vertices = terrorists)
g <- graph_from_data_frame(
  ties,
  directed = FALSE,
  vertices = tibble(id = seq_len(nrow(terrorists)))
)
#g = delete_vertex_attr(g, "name")

The most dangerous terrorists

# degree and strength
# Add, for every person, the number of ties (degree) and the igraph strength
# of the corresponding vertex of g.
# NOTE(review): relies on the vertices of g being in the same order as the
# rows of `terrorists` -- confirm if the graph construction changes.
terrorists <- mutate(
  terrorists,
  degree = degree(g),
  strength = strength(g)
)

# most connected terrorists
terrorists %>% arrange(desc(degree))
## # A tibble: 70 x 4
##       id               name degree strength
##    <int>              <chr>  <dbl>    <dbl>
##  1     1       Jamal Zougam     29       43
##  2     3     Mohamed Chaoui     27       34
##  3     7 Imad Eddin Barakat     22       35
##  4    11         Amer Azizi     18       27
##  5    41        Said Berrak     17       19
##  6    18       Galeb Kalaje     16       21
##  7    24  Naima Oulad Akcha     16       16
##  8    19   Abderrahim Zbakh     15       15
##  9    31      Jamal Ahmidan     14       14
## 10    61 Mohamed El Egipcio     13       14
## # ... with 60 more rows
# most strongly connected terrorists (rows sorted by decreasing strength)
terrorists %>% arrange(desc(strength))
## # A tibble: 70 x 4
##       id               name degree strength
##    <int>              <chr>  <dbl>    <dbl>
##  1     1       Jamal Zougam     29       43
##  2     7 Imad Eddin Barakat     22       35
##  3     3     Mohamed Chaoui     27       34
##  4    11         Amer Azizi     18       27
##  5    18       Galeb Kalaje     16       21
##  6    15   Mohamed Belfatmi     11       19
##  7    41        Said Berrak     17       19
##  8    16        Said Bahaji     11       17
##  9    24  Naima Oulad Akcha     16       16
## 10    19   Abderrahim Zbakh     15       15
## # ... with 60 more rows
# isolated terrorists (people whose degree is zero)
terrorists %>% filter(degree == 0)
## # A tibble: 6 x 4
##      id                    name degree strength
##   <int>                   <chr>  <dbl>    <dbl>
## 1    17              Al? Amrous      0        0
## 2    26    Abdelhalak Bentasser      0        0
## 3    29           Faisal Alluch      0        0
## 4    46 Mohamad Bard Ddin Akkab      0        0
## 5    47            Abu Zubaidah      0        0
## 6    48         Sanel Sjekirika      0        0

Similar pairs of terrorists

# Scatterplot of the ties as listed: one point per (from, to) pair, colored
# by the tie weight treated as a category.
ggplot(ties, aes(x = from, y = to, color = as.factor(weight))) +
  geom_point() +
  labs(color = "weight")

# add reciprocal edges
# Append a copy of every tie with its endpoints swapped, so each tie shows up
# on both sides of the diagonal. The swap goes through a temporary column
# because mutate() evaluates its arguments sequentially.
ties2 <- ties %>%
  mutate(temp = to, to = from, from = temp) %>%
  select(-temp) %>%
  bind_rows(ties, .)

ggplot(ties2, aes(x = from, y = to, color = as.factor(weight))) +
  geom_point() +
  labs(color = "weight")

# Same symmetric view, with the weight mapped to point transparency instead
# of color.
ggplot(ties2, aes(x = from, y = to, alpha = weight)) +
  geom_point()

# remove isolated nodes
# `isolated` keeps the ids of the people without ties for reference.
isolated <- filter(terrorists, degree == 0)$id
# The vertices to drop are selected from the graph's current degrees rather
# than by using the stored ids as vertex positions: positions shift once
# vertices are removed, so re-running a position-based delete would remove
# connected people. Selecting by degree makes this step safe to re-run.
g <- delete_vertices(g, which(degree(g) == 0))

# Dense adjacency matrix whose entries are the tie weights.
A <- as_adjacency_matrix(g, attr = "weight", sparse = FALSE)

vcount(g)
## [1] 64
# strength
# Sanity check: counts the vertices whose strength equals their row sum in A.
sum(strength(g) == rowSums(A))
## [1] 64
# degree
# Sanity check: counts the vertices whose degree equals the number of
# non-zero entries in their row of A.
sum(degree(g) == rowSums(A > 0))
## [1] 64
# similarity as (Pearson) correlation among columns
S <- cor(A)

# remove self similarity
# Zero the diagonal so that no person is paired with themselves below.
diag(S) <- 0

# some statistics
summary(c(S))
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.27574 -0.07729 -0.03336  0.07074  0.17037  1.00000
# tidy similarity matrix
# Turn the similarity matrix into a weighted undirected graph so that its
# edge list gives one row per pair.
h <- graph_from_adjacency_matrix(S, mode = "undirected", weighted = TRUE)

# igraph::as_data_frame is called explicitly so the result does not depend on
# the order in which igraph and the tidyverse were attached. as_tibble()
# replaces the deprecated as.tibble().
# NOTE(review): from/to are converted with as.integer(), which presumably
# works because the vertex names are the numeric ids -- confirm.
sim <- igraph::as_data_frame(h, what = "edges") %>%
  as_tibble() %>%
  mutate(from = as.integer(from), to = as.integer(to)) %>%
  arrange(desc(weight))


# join sim with terrorists
# Look up both endpoints of every pair in `terrorists` (columns of the `from`
# person get the .x suffix, those of the `to` person .y), drop the strength
# columns, and keep only pairs in which both people have degree >= 10.
sim_joint <- sim %>%
  left_join(terrorists, by = c("from" = "id")) %>%
  left_join(terrorists, by = c("to" = "id")) %>%
  select(-strength.x, -strength.y) %>%
  filter(degree.x >= 10, degree.y >= 10)

# most similar pairs
sim_joint %>% head()
## # A tibble: 6 x 7
##    from    to    weight              name.x degree.x           name.y
##   <int> <int>     <dbl>               <chr>    <dbl>            <chr>
## 1    13    14 0.9061526        Mohamed Atta       10 Ramzi Binalshibh
## 2     4     5 0.8814815         Vinay Kholy       10     Suresh Kumar
## 3    12    58 0.8553506 Abu Musad Alsakaoui       10           Shakur
## 4    12    13 0.8471779 Abu Musad Alsakaoui       10     Mohamed Atta
## 5    12    15 0.8363355 Abu Musad Alsakaoui       10 Mohamed Belfatmi
## 6     4    28 0.8305289         Vinay Kholy       10    Basel Ghayoun
## # ... with 1 more variables: degree.y <dbl>
# most dissimilar pairs (the last rows of the weight-sorted table)
sim_joint %>% tail()
## # A tibble: 6 x 7
##    from    to     weight              name.x degree.x          name.y
##   <int> <int>      <dbl>               <chr>    <dbl>           <chr>
## 1    16    63 -0.1881146         Said Bahaji       11 Semaan Gaby Eid
## 2     1    63 -0.1942068        Jamal Zougam       29 Semaan Gaby Eid
## 3    12    63 -0.1960476 Abu Musad Alsakaoui       10 Semaan Gaby Eid
## 4    40    63 -0.1993567   Abdeluahid Berrak       11 Semaan Gaby Eid
## 5     3    63 -0.2186105      Mohamed Chaoui       27 Semaan Gaby Eid
## 6     7    63 -0.2264384  Imad Eddin Barakat       22 Semaan Gaby Eid
## # ... with 1 more variables: degree.y <dbl>
# plot similarity graph
# Build a graph from the pairs with similarity >= 0.60, then drop every
# person left without such a pair.
# NOTE(review): `terrorists` carries a `name` column, which presumably ends
# up as the vertex `name` attribute used for the labels below -- confirm.
h2 <- graph_from_data_frame(
  filter(sim_joint, weight >= 0.60),
  directed = FALSE,
  vertices = terrorists
)
h2 <- delete_vertices(h2, which(degree(h2) == 0))


# Plain view: edge transparency encodes the similarity weight.
ggraph(h2) + 
  geom_edge_link(aes(alpha = weight)) + 
  geom_node_point() +
  theme_graph()
## Using `nicely` as default layout

# Node size encodes the `strength` vertex attribute taken from `terrorists`.
ggraph(h2) + 
  geom_edge_link(aes(alpha = weight)) + 
  geom_node_point(aes(size = strength)) +
  theme_graph()
## Using `nicely` as default layout

# Labelled view; repel = TRUE (spelled out, since T can be reassigned) keeps
# the labels from overlapping.
ggraph(h2) + 
  geom_edge_link(aes(alpha = weight)) + 
  geom_node_point() +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_graph()
## Using `nicely` as default layout

# Store betweenness as a vertex attribute so it can be mapped to node size.
V(h2)$betw <- betweenness(h2)

ggraph(h2) + 
  geom_edge_link(aes(alpha = weight)) + 
  geom_node_point(aes(size = betw)) +
  theme_graph()
## Using `nicely` as default layout

# distance matrix
# Dissimilarity is one minus the similarity.
D <- 1 - S

# distance object
d <- as.dist(D)

# average-linkage clustering method
cc <- d %>% hclust(method = "average")

# Dendrogram form of the clustering, used for the ggraph plot at the end.
dendrogram <- as.dendrogram(cc)

# plot dendrogram
plot(cc)

# draw blue borders around clusters
# Cuts the tree into k = 4 groups on the plot drawn above and keeps the
# returned value.
clusters.list <- rect.hclust(cc, k = 4, border = "blue")

# Circular dendrogram; leaf labels are pushed slightly outwards (factor 1.03)
# from the leaf positions.
ggraph(dendrogram, layout = "dendrogram", circular = TRUE) +
  geom_edge_diagonal() +
  geom_node_text(
    aes(filter = leaf, label = label, x = x * 1.03, y = y * 1.03),
    size = 3
  ) +
  theme_graph()