Google BigQuery is a cloud-based big data analytics web service for processing very large read-only data sets. BigQuery was designed for analyzing data on the order of billions of rows, using a SQL-like syntax. It runs on the Google Cloud Storage infrastructure.
Here we show an example of using BigQuery from R using the bigrquery package. The example is adapted from the github of the package (read section Billing project to use the code below). We query the Ethereum dataset that contains data on the blockchain Ethereum. See the Kaggle page for more information about the dataset. Here are the collected datasets of this challenge.
library(bigrquery)
library(DBI)
library(dplyr)
library(ggplot2)
library(readr)
# add your project id: free limits are 1 TB of queries / 10 GB of storage.
billing = "your-project-id"
# create connection
con = dbConnect(
bigrquery::bigquery(),
project = "bigquery-public-data",
dataset = "ethereum_blockchain",
billing = billing
)
# list tables of dataset
dbListTables(con)
#
# SQL
#
sql =
"SELECT contracts.address, COUNT(1) AS tx_count
FROM `bigquery-public-data.ethereum_blockchain.contracts` AS contracts
JOIN `bigquery-public-data.ethereum_blockchain.transactions` AS transactions ON (transactions.to_address = contracts.address)
WHERE contracts.is_erc721 = TRUE
GROUP BY contracts.address
ORDER BY tx_count DESC
LIMIT 10"
# run SQL query
dbGetQuery(con, sql, n = 10)
#
# DPLYR
#
# get data source
contracts = tbl(con, "contracts")
transactions = tbl(con, "transactions")
# data sources are remote data frames, not local ones
class(contracts)
# ERC721
topERC721_contracts =
contracts %>%
filter(is_erc721 == TRUE) %>%
inner_join(transactions, by = c("address" = "to_address")) %>%
group_by(address) %>%
summarise(tx_count = n()) %>%
arrange(desc(tx_count)) %>%
head(10) %>%
collect() # collect the result (move it from cloud to your computer's memory)
# topERC721_contracts is a local data frame
class(topERC721_contracts)
# save
write_csv(topERC721_contracts, "topERC721_contracts.csv")
# ERC20
topERC20_contracts =
contracts %>%
filter(is_erc20 == TRUE) %>%
inner_join(transactions, by = c("address" = "to_address")) %>%
group_by(address) %>%
summarise(tx_count = n(), tx_value = sum(value, na.rm = TRUE)) %>%
arrange(desc(tx_value)) %>%
head(10) %>%
collect()
write_csv(topERC20_contracts, "topERC20_contracts.csv")
(topERC721_contracts = read_csv("topERC721_contracts.csv"))
## # A tibble: 10 x 2
## address tx_count
## <chr> <dbl>
## 1 0x06012c8cf97bead5deae237070f9587f8e7a266d 3697191
## 2 0xd73be539d6b2076bab83ca6ba62dfe189abc6bbe 333802
## 3 0x06a6a7af298129e3a2ab396c9c06f91d3c54aba8 139107
## 4 0x1a94fce7ef36bc90959e206ba569a12afbc91ca1 133431
## 5 0x7fdcd2a1e52f10c28cb7732f46393e297ecadda1 98260
## 6 0x8c9b261faef3b3c2e64ab5e58e04615f8c788099 81943
## 7 0xd2f81cd7a20d60c0d558496c7169a20968389b40 58676
## 8 0xa98ad92a642570b83b369c4eb70efefe638bc895 42006
## 9 0xe58af3dcbe423c1e330f9d6e61cd47cb530239ac 36776
## 10 0x5f5ff74b198e5095069cf132d2523dbae516613f 36051
(topERC20_contracts = read_csv("topERC20_contracts.csv"))
## # A tibble: 10 x 3
## address tx_count tx_value
## <chr> <dbl> <dbl>
## 1 0xbb9bc244d798123fde783fcc1c72d3bb8c189413 173537 1.20e25
## 2 0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2 352919 3.01e24
## 3 0xaf30d2a7e90d7dc361c8c4585e9bb7d2f6f15bc7 83301 9.97e23
## 4 0x0d8775f648430679a709e98d2b0cb6250d2887ef 787546 8.71e23
## 5 0xecf8f87f810ecf450940c9f60066b4a7a501d6a7 16898 4.20e23
## 6 0x960b236a07cf122663c4303350609a66a7b288c0 154842 3.60e23
## 7 0x96a65609a7b84e8842732deb08f56c3e21ac6f8a 96010 1.70e23
## 8 0xef68e7c694f40c8202821edf525de3782458639f 320372 1.24e23
## 9 0xe3818504c1b32bf1557b16c238b2e01fd3149c17 105995 9.60e22
## 10 0xac709fcb44a43c35f0da4e3163b117a17f3770f5 3800 9.56e22
blocks <- tbl(con, "blocks")
miners = blocks %>%
group_by(miner) %>%
summarize(block_count = n(), tx_count = sum(transaction_count, na.rm = TRUE)) %>%
arrange(desc(block_count)) %>%
head(1000) %>%
collect()
write_csv(miners, "miners.csv")
(miners = read_csv("miners.csv"))
## # A tibble: 1,000 x 3
## miner block_count tx_count
## <chr> <dbl> <dbl>
## 1 0xea674fdde714fd979de3edf0f56aa9716b898ec8 1399845 119827016
## 2 0x2a65aca4d5fc5b5c859090a6c34d164135398226 917768 17050862
## 3 0x52bc44d5378309ee2abf1539bf71de1b7d7be3b5 838666 54647379
## 4 0x829bd824b016326a401d083b33d092293333a830 663117 75059518
## 5 0x5a0b54d5dc17e0aadc383d2db43b0a0d3e029c4c 561487 62943023
## 6 0x61c808d82a3ac53231750dadc13c777b59310bd9 481983 2589277
## 7 0xb2930b35844a230f00e51431acae96fe543a0347 387044 37669065
## 8 0x4bb96091ee9d802ed039c4d1a5f6216f90f81b01 274651 6259412
## 9 0x1e9939daaad6924ad004c2560e90804164900341 182950 6274664
## 10 0xf8b483dba2c3b7176a3da549ad41a48bb3121069 162265 578116
## # … with 990 more rows
summary(miners$block_count)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 37.0 64.0 142.0 7688.7 502.8 1399845.0
ggplot(filter(miners, block_count <= quantile(block_count, 0.75))) +
geom_histogram(aes(x = block_count))
The SuperRare contract has address 0x41a322b28d0ff354040e2cbc676f0320d8c8850d
. The relevant actors are:
0x0000000000000000000000000000000000000000
token_transfers = tbl(con, "token_transfers")
SuperRareContract = "0x41a322b28d0ff354040e2cbc676f0320d8c8850d"
SuperRare = token_transfers %>%
filter(token_address == SuperRareContract) %>%
collect()
write_csv(SuperRare, "SuperRare.csv")
(SuperRare = read_csv("SuperRare.csv"))
## # A tibble: 5,531 x 9
## token_address from_address to_address value transaction_hash log_index
## <chr> <chr> <chr> <dbl> <chr> <dbl>
## 1 0x41a322b28d… 0xb308082a8… 0xf2939a3… 1549 0x4bef2f568a3df… 95
## 2 0x41a322b28d… 0xb308082a8… 0xf2939a3… 1603 0xe572e033a206c… 120
## 3 0x41a322b28d… 0xb308082a8… 0xf2939a3… 1777 0xdb25d91107376… 126
## 4 0x41a322b28d… 0x000000000… 0x4ad818a… 1910 0x8dfa63491a234… 56
## 5 0x41a322b28d… 0xd7e1bc51c… 0x9a3e204… 1878 0x7e5990f8745a8… 49
## 6 0x41a322b28d… 0x000000000… 0x4cc9402… 175 0x18306299adb35… 142
## 7 0x41a322b28d… 0x000000000… 0x865c2f8… 1383 0xcd9315c00f485… 67
## 8 0x41a322b28d… 0x39cc9c86e… 0x3768225… 1090 0x1c5125683fe17… 45
## 9 0x41a322b28d… 0xa02378ca1… 0x4a7765c… 1318 0xc4f56e0e2be4d… 68
## 10 0x41a322b28d… 0x000000000… 0x865c2f8… 1384 0x040419597437d… 48
## # … with 5,521 more rows, and 3 more variables: block_timestamp <dttm>,
## # block_number <dbl>, block_hash <chr>
creators =
SuperRare %>%
filter(from_address == "0x0000000000000000000000000000000000000000") %>%
group_by(to_address) %>%
summarize(count = n()) %>%
arrange(desc(count))
senders =
SuperRare %>%
filter(from_address != "0x0000000000000000000000000000000000000000") %>%
group_by(from_address) %>%
summarize(count = n()) %>%
arrange(desc(count))
receivers =
SuperRare %>%
filter(from_address != "0x0000000000000000000000000000000000000000") %>%
group_by(to_address) %>%
summarize(count = n()) %>%
arrange(desc(count))
creators
## # A tibble: 132 x 2
## to_address count
## <chr> <int>
## 1 0x860c4604fe1125ea43f81e613e7afb2aa49546aa 315
## 2 0x7f67e80272d6349a70acd0a6d0dc169292431020 132
## 3 0x60d38778adbbeeac88f741b833cbb9877228eea0 107
## 4 0x77350e1152efd5f2d807a6124015c629a907155e 96
## 5 0xb3007ff3c3f40bdf0126fec7c8e43c3fc50ea813 91
## 6 0xcd815dd7ebfd7eb46fc2858f040296b654cd6b81 80
## 7 0x39cc9c86e67baf2129b80fe3414c397492ea8026 78
## 8 0xd7e1bc51cd3f30e21b17bab33d77078e3fb7cc26 78
## 9 0x21316e6a4f0af45e5f1503984e83b10c53b177d8 74
## 10 0x5b0d92e138422428d8e5afab8ee8f99ba9a4e613 74
## # … with 122 more rows
senders
## # A tibble: 171 x 2
## from_address count
## <chr> <int>
## 1 0x860c4604fe1125ea43f81e613e7afb2aa49546aa 315
## 2 0xdc19d539369c3da9172de453c05bc503c9ac6bc2 182
## 3 0xb3007ff3c3f40bdf0126fec7c8e43c3fc50ea813 85
## 4 0x21316e6a4f0af45e5f1503984e83b10c53b177d8 81
## 5 0x39cc9c86e67baf2129b80fe3414c397492ea8026 74
## 6 0x77350e1152efd5f2d807a6124015c629a907155e 69
## 7 0x576a655161b5502dcf40602be1f3519a89b71658 54
## 8 0xf8b32d30ac6ab3030595432533d7836fd76b078d 53
## 9 0x01cb023186cab05220554ee75b4d69921dd051f1 49
## 10 0xd7e1bc51cd3f30e21b17bab33d77078e3fb7cc26 48
## # … with 161 more rows
receivers
## # A tibble: 555 x 2
## to_address count
## <chr> <int>
## 1 0x10017ca37b1257ac0771e24652aa28c758e378eb 269
## 2 0xdc19d539369c3da9172de453c05bc503c9ac6bc2 241
## 3 0xe25c73435702fed11e9c5584ce6efe7cbff71739 240
## 4 0x41a322b28d0ff354040e2cbc676f0320d8c8850d 132
## 5 0xbaba92b7822a56c05554ab5d1bc1d0b7e212499d 58
## 6 0x576a655161b5502dcf40602be1f3519a89b71658 53
## 7 0xb3007ff3c3f40bdf0126fec7c8e43c3fc50ea813 53
## 8 0x335e19b3f5dcfa2e398ee7d5ff430a1f8ccc88b2 50
## 9 0x337101def3eeb6f06e071efe02216274507937bb 47
## 10 0x6853a596d6d7264d3622546da3b891b6fe17eb82 45
## # … with 545 more rows
The value of an ERC721 token transfer is the id of the token.
tokens =
SuperRare %>%
filter(from_address != "0x0000000000000000000000000000000000000000") %>%
count(value) %>%
rename(token = value, count = n) %>%
arrange(desc(count))
tokens
## # A tibble: 1,903 x 2
## token count
## <dbl> <int>
## 1 1000 9
## 2 172 6
## 3 119 5
## 4 65 4
## 5 214 4
## 6 673 4
## 7 832 4
## 8 1309 4
## 9 59 3
## 10 77 3
## # … with 1,893 more rows
# primary market
primary = filter(tokens, count == 1)
# secondary market
secondary = filter(tokens, count > 1)
nrow(secondary)
## [1] 359
nrow(primary)
## [1] 1544
nrow(secondary) / nrow(tokens)
## [1] 0.1886495
BMS::hex2bin
functionblocks = tbl(con, "blocks")
# retrieve 1000 block hashes
blocks1000 = blocks %>%
filter(number < 1000) %>%
arrange(number) %>%
select(number, timestamp, hash, transaction_count) %>%
collect()
write_csv(block1000, "block1000.csv")
(blocks1000 = read_csv("blocks1000.csv"))
## # A tibble: 1,000 x 4
## number timestamp hash transaction_cou…
## <dbl> <dttm> <chr> <dbl>
## 1 0 1970-01-01 00:00:00 0xd4e56740f876aef8c010b86a4… 0
## 2 1 2015-07-30 15:26:28 0x88e96d4537bea4d9c05d12549… 0
## 3 2 2015-07-30 15:26:57 0xb495a1d7e6663152ae92708da… 0
## 4 3 2015-07-30 15:27:28 0x3d6122660cc824376f11ee842… 0
## 5 4 2015-07-30 15:27:57 0x23adf5a3be0f5235b36941bcb… 0
## 6 5 2015-07-30 15:28:03 0xf37c632d361e0a93f08ba29b1… 0
## 7 6 2015-07-30 15:28:27 0x1f1aed8e3694a067496c248e6… 0
## 8 7 2015-07-30 15:28:30 0xe0c7c0b46e116b874354dce6f… 0
## 9 8 2015-07-30 15:28:32 0x2ce94342df186bab4165c268c… 0
## 10 9 2015-07-30 15:28:35 0x997e47bf4cac509c627753c06… 0
## # … with 990 more rows
# compute entropy of hashes of blocks
h = blocks1000$hash
# remove prefix
h = substring(h, 3, nchar(h))
# convert to binary
bin = unlist(lapply(h, BMS::hex2bin))
# how many 0s and 1s?
(t = table(bin) / length(bin))
## bin
## 0 1
## 0.5001758 0.4998242
# entropy
-sum(t * log2(t))
## [1] 0.9999999