Google BigQuery is a cloud-based big data analytics web service for processing very large read-only data sets. BigQuery was designed for analyzing data on the order of billions of rows, using a SQL-like syntax. It runs on the Google Cloud Storage infrastructure.

Here we show an example of using BigQuery from R using the bigrquery package. The example is adapted from the github of the package (read section Billing project to use the code below). We query the Ethereum dataset that contains data on the blockchain Ethereum. See the Kaggle page for more information about the dataset. Here are the collected datasets of this challenge.

Connect

library(bigrquery)
library(DBI)
library(dplyr)
library(ggplot2)
library(readr)
# add your project id: free limits are 1 TB of queries / 10 GB of storage.
billing = "your-project-id"

# create connection
con = dbConnect(
  bigrquery::bigquery(),
  project = "bigquery-public-data",
  dataset = "ethereum_blockchain",
  billing = billing
)

# list tables of dataset
dbListTables(con)

Which are the top smart contracts?

#
# SQL
#
sql = 
"SELECT contracts.address, COUNT(1) AS tx_count
FROM `bigquery-public-data.ethereum_blockchain.contracts` AS contracts
JOIN `bigquery-public-data.ethereum_blockchain.transactions` AS transactions ON (transactions.to_address = contracts.address)
WHERE contracts.is_erc721 = TRUE
GROUP BY contracts.address
ORDER BY tx_count DESC
LIMIT 10"

# run SQL query
dbGetQuery(con, sql, n = 10)


#
# DPLYR
#
# get data source 
contracts = tbl(con, "contracts")
transactions = tbl(con, "transactions")

# data sources are remote data frames, not  local ones
class(contracts)

# ERC721
topERC721_contracts = 
  contracts %>% 
  filter(is_erc721 == TRUE) %>% 
  inner_join(transactions, by = c("address" = "to_address")) %>% 
  group_by(address) %>% 
  summarise(tx_count = n()) %>% 
  arrange(desc(tx_count)) %>% 
  head(10) %>% 
  collect() # collect the result (move it from cloud to your computer's memory)

# topERC721_contracts is a local data frame
class(topERC721_contracts)
# save
write_csv(topERC721_contracts, "topERC721_contracts.csv")




# ERC20
topERC20_contracts = 
  contracts %>% 
  filter(is_erc20 == TRUE) %>% 
  inner_join(transactions, by = c("address" = "to_address")) %>% 
  group_by(address) %>% 
  summarise(tx_count = n(), tx_value = sum(value, na.rm = TRUE)) %>% 
  arrange(desc(tx_value)) %>% 
  head(10) %>% 
  collect()

write_csv(topERC20_contracts, "topERC20_contracts.csv")
(topERC721_contracts = read_csv("topERC721_contracts.csv"))
## # A tibble: 10 x 2
##    address                                    tx_count
##    <chr>                                         <dbl>
##  1 0x06012c8cf97bead5deae237070f9587f8e7a266d  3697191
##  2 0xd73be539d6b2076bab83ca6ba62dfe189abc6bbe   333802
##  3 0x06a6a7af298129e3a2ab396c9c06f91d3c54aba8   139107
##  4 0x1a94fce7ef36bc90959e206ba569a12afbc91ca1   133431
##  5 0x7fdcd2a1e52f10c28cb7732f46393e297ecadda1    98260
##  6 0x8c9b261faef3b3c2e64ab5e58e04615f8c788099    81943
##  7 0xd2f81cd7a20d60c0d558496c7169a20968389b40    58676
##  8 0xa98ad92a642570b83b369c4eb70efefe638bc895    42006
##  9 0xe58af3dcbe423c1e330f9d6e61cd47cb530239ac    36776
## 10 0x5f5ff74b198e5095069cf132d2523dbae516613f    36051
(topERC20_contracts = read_csv("topERC20_contracts.csv"))
## # A tibble: 10 x 3
##    address                                    tx_count tx_value
##    <chr>                                         <dbl>    <dbl>
##  1 0xbb9bc244d798123fde783fcc1c72d3bb8c189413   173537  1.20e25
##  2 0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2   352919  3.01e24
##  3 0xaf30d2a7e90d7dc361c8c4585e9bb7d2f6f15bc7    83301  9.97e23
##  4 0x0d8775f648430679a709e98d2b0cb6250d2887ef   787546  8.71e23
##  5 0xecf8f87f810ecf450940c9f60066b4a7a501d6a7    16898  4.20e23
##  6 0x960b236a07cf122663c4303350609a66a7b288c0   154842  3.60e23
##  7 0x96a65609a7b84e8842732deb08f56c3e21ac6f8a    96010  1.70e23
##  8 0xef68e7c694f40c8202821edf525de3782458639f   320372  1.24e23
##  9 0xe3818504c1b32bf1557b16c238b2e01fd3149c17   105995  9.60e22
## 10 0xac709fcb44a43c35f0da4e3163b117a17f3770f5     3800  9.56e22

Who are the top block miners?

blocks <- tbl(con, "blocks")
miners = blocks %>% 
  group_by(miner) %>% 
  summarize(block_count = n(), tx_count = sum(transaction_count, na.rm = TRUE)) %>% 
  arrange(desc(block_count)) %>% 
  head(1000) %>% 
  collect()

write_csv(miners, "miners.csv")
(miners = read_csv("miners.csv"))
## # A tibble: 1,000 x 3
##    miner                                      block_count  tx_count
##    <chr>                                            <dbl>     <dbl>
##  1 0xea674fdde714fd979de3edf0f56aa9716b898ec8     1399845 119827016
##  2 0x2a65aca4d5fc5b5c859090a6c34d164135398226      917768  17050862
##  3 0x52bc44d5378309ee2abf1539bf71de1b7d7be3b5      838666  54647379
##  4 0x829bd824b016326a401d083b33d092293333a830      663117  75059518
##  5 0x5a0b54d5dc17e0aadc383d2db43b0a0d3e029c4c      561487  62943023
##  6 0x61c808d82a3ac53231750dadc13c777b59310bd9      481983   2589277
##  7 0xb2930b35844a230f00e51431acae96fe543a0347      387044  37669065
##  8 0x4bb96091ee9d802ed039c4d1a5f6216f90f81b01      274651   6259412
##  9 0x1e9939daaad6924ad004c2560e90804164900341      182950   6274664
## 10 0xf8b483dba2c3b7176a3da549ad41a48bb3121069      162265    578116
## # … with 990 more rows
summary(miners$block_count)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##      37.0      64.0     142.0    7688.7     502.8 1399845.0
ggplot(filter(miners, block_count <= quantile(block_count, 0.75))) +
  geom_histogram(aes(x = block_count))

Who are the top senders, receivers and creators of SuperRare tokens?

The SuperRare contract has address 0x41a322b28d0ff354040e2cbc676f0320d8c8850d. The relevant actors are:

token_transfers =  tbl(con, "token_transfers")
SuperRareContract = "0x41a322b28d0ff354040e2cbc676f0320d8c8850d"

SuperRare = token_transfers %>% 
  filter(token_address == SuperRareContract) %>% 
  collect()

write_csv(SuperRare, "SuperRare.csv")
(SuperRare = read_csv("SuperRare.csv"))
## # A tibble: 5,531 x 9
##    token_address from_address to_address value transaction_hash log_index
##    <chr>         <chr>        <chr>      <dbl> <chr>                <dbl>
##  1 0x41a322b28d… 0xb308082a8… 0xf2939a3…  1549 0x4bef2f568a3df…        95
##  2 0x41a322b28d… 0xb308082a8… 0xf2939a3…  1603 0xe572e033a206c…       120
##  3 0x41a322b28d… 0xb308082a8… 0xf2939a3…  1777 0xdb25d91107376…       126
##  4 0x41a322b28d… 0x000000000… 0x4ad818a…  1910 0x8dfa63491a234…        56
##  5 0x41a322b28d… 0xd7e1bc51c… 0x9a3e204…  1878 0x7e5990f8745a8…        49
##  6 0x41a322b28d… 0x000000000… 0x4cc9402…   175 0x18306299adb35…       142
##  7 0x41a322b28d… 0x000000000… 0x865c2f8…  1383 0xcd9315c00f485…        67
##  8 0x41a322b28d… 0x39cc9c86e… 0x3768225…  1090 0x1c5125683fe17…        45
##  9 0x41a322b28d… 0xa02378ca1… 0x4a7765c…  1318 0xc4f56e0e2be4d…        68
## 10 0x41a322b28d… 0x000000000… 0x865c2f8…  1384 0x040419597437d…        48
## # … with 5,521 more rows, and 3 more variables: block_timestamp <dttm>,
## #   block_number <dbl>, block_hash <chr>
creators = 
  SuperRare %>% 
  filter(from_address == "0x0000000000000000000000000000000000000000") %>% 
  group_by(to_address) %>% 
  summarize(count = n()) %>% 
  arrange(desc(count))

senders = 
  SuperRare %>% 
  filter(from_address != "0x0000000000000000000000000000000000000000") %>% 
  group_by(from_address) %>% 
  summarize(count = n()) %>% 
  arrange(desc(count))

receivers = 
  SuperRare %>% 
  filter(from_address != "0x0000000000000000000000000000000000000000") %>% 
  group_by(to_address) %>% 
  summarize(count = n()) %>% 
  arrange(desc(count))


creators
## # A tibble: 132 x 2
##    to_address                                 count
##    <chr>                                      <int>
##  1 0x860c4604fe1125ea43f81e613e7afb2aa49546aa   315
##  2 0x7f67e80272d6349a70acd0a6d0dc169292431020   132
##  3 0x60d38778adbbeeac88f741b833cbb9877228eea0   107
##  4 0x77350e1152efd5f2d807a6124015c629a907155e    96
##  5 0xb3007ff3c3f40bdf0126fec7c8e43c3fc50ea813    91
##  6 0xcd815dd7ebfd7eb46fc2858f040296b654cd6b81    80
##  7 0x39cc9c86e67baf2129b80fe3414c397492ea8026    78
##  8 0xd7e1bc51cd3f30e21b17bab33d77078e3fb7cc26    78
##  9 0x21316e6a4f0af45e5f1503984e83b10c53b177d8    74
## 10 0x5b0d92e138422428d8e5afab8ee8f99ba9a4e613    74
## # … with 122 more rows
senders
## # A tibble: 171 x 2
##    from_address                               count
##    <chr>                                      <int>
##  1 0x860c4604fe1125ea43f81e613e7afb2aa49546aa   315
##  2 0xdc19d539369c3da9172de453c05bc503c9ac6bc2   182
##  3 0xb3007ff3c3f40bdf0126fec7c8e43c3fc50ea813    85
##  4 0x21316e6a4f0af45e5f1503984e83b10c53b177d8    81
##  5 0x39cc9c86e67baf2129b80fe3414c397492ea8026    74
##  6 0x77350e1152efd5f2d807a6124015c629a907155e    69
##  7 0x576a655161b5502dcf40602be1f3519a89b71658    54
##  8 0xf8b32d30ac6ab3030595432533d7836fd76b078d    53
##  9 0x01cb023186cab05220554ee75b4d69921dd051f1    49
## 10 0xd7e1bc51cd3f30e21b17bab33d77078e3fb7cc26    48
## # … with 161 more rows
receivers
## # A tibble: 555 x 2
##    to_address                                 count
##    <chr>                                      <int>
##  1 0x10017ca37b1257ac0771e24652aa28c758e378eb   269
##  2 0xdc19d539369c3da9172de453c05bc503c9ac6bc2   241
##  3 0xe25c73435702fed11e9c5584ce6efe7cbff71739   240
##  4 0x41a322b28d0ff354040e2cbc676f0320d8c8850d   132
##  5 0xbaba92b7822a56c05554ab5d1bc1d0b7e212499d    58
##  6 0x576a655161b5502dcf40602be1f3519a89b71658    53
##  7 0xb3007ff3c3f40bdf0126fec7c8e43c3fc50ea813    53
##  8 0x335e19b3f5dcfa2e398ee7d5ff430a1f8ccc88b2    50
##  9 0x337101def3eeb6f06e071efe02216274507937bb    47
## 10 0x6853a596d6d7264d3622546da3b891b6fe17eb82    45
## # … with 545 more rows

Which are the most traded tokens on SuperRare?

The value of an ERC721 token transfer is the id of the token.

tokens = 
  SuperRare %>% 
  filter(from_address != "0x0000000000000000000000000000000000000000") %>% 
  count(value) %>% 
  rename(token = value, count = n) %>% 
  arrange(desc(count))

tokens
## # A tibble: 1,903 x 2
##    token count
##    <dbl> <int>
##  1  1000     9
##  2   172     6
##  3   119     5
##  4    65     4
##  5   214     4
##  6   673     4
##  7   832     4
##  8  1309     4
##  9    59     3
## 10    77     3
## # … with 1,893 more rows
# primary market
primary = filter(tokens, count == 1)
# secondary market
secondary = filter(tokens, count > 1)

nrow(secondary)
## [1] 359
nrow(primary)
## [1] 1544
nrow(secondary) / nrow(tokens)
## [1] 0.1886495

Are block hashes random numbers?

blocks = tbl(con, "blocks")


# retrieve 1000 block hashes
blocks1000 = blocks %>%
  filter(number < 1000) %>% 
  arrange(number) %>% 
  select(number, timestamp, hash, transaction_count) %>% 
  collect() 

write_csv(block1000, "block1000.csv")
(blocks1000 = read_csv("blocks1000.csv"))
## # A tibble: 1,000 x 4
##    number timestamp           hash                         transaction_cou…
##     <dbl> <dttm>              <chr>                                   <dbl>
##  1      0 1970-01-01 00:00:00 0xd4e56740f876aef8c010b86a4…                0
##  2      1 2015-07-30 15:26:28 0x88e96d4537bea4d9c05d12549…                0
##  3      2 2015-07-30 15:26:57 0xb495a1d7e6663152ae92708da…                0
##  4      3 2015-07-30 15:27:28 0x3d6122660cc824376f11ee842…                0
##  5      4 2015-07-30 15:27:57 0x23adf5a3be0f5235b36941bcb…                0
##  6      5 2015-07-30 15:28:03 0xf37c632d361e0a93f08ba29b1…                0
##  7      6 2015-07-30 15:28:27 0x1f1aed8e3694a067496c248e6…                0
##  8      7 2015-07-30 15:28:30 0xe0c7c0b46e116b874354dce6f…                0
##  9      8 2015-07-30 15:28:32 0x2ce94342df186bab4165c268c…                0
## 10      9 2015-07-30 15:28:35 0x997e47bf4cac509c627753c06…                0
## # … with 990 more rows
# compute entropy of hashes of blocks
h = blocks1000$hash
# remove prefix
h = substring(h, 3, nchar(h))
# convert to binary
bin = unlist(lapply(h, BMS::hex2bin))
# how many 0s and 1s?
(t = table(bin) / length(bin))
## bin
##         0         1 
## 0.5001758 0.4998242
# entropy
-sum(t * log2(t))
## [1] 0.9999999