Connect to SciDB

library(arrayop) # load the package so we can leave out the `arrayop::` prefix
# First, load user name and token from an authentication file (YAML), which is also compatible with iquery.
auth = yaml::yaml.load_file("~/.scidb_auth")
# Then connect to scidb. Params host, username and token are all single strings and can be provided in any way.
conn = db_connect(host = "localhost", username = auth[["user-name"]], token = auth[["user-password"]])

Once we have connected to scidb with db_connect, we can save the connection instance as shown above, or call arrayop::get_default_connection() anywhere in code, which always returns the db connection.

# The saved connection and the package-wide default are the very same object
identical(get_default_connection(), conn) # the exact same thing!
#> [1] TRUE
print(conn) # print out `username@host [scidb version]` if the connection is successful
#> ScidbConnection: root@127.0.0.1 [23.10.14]

Get array_op instances

We use the term array_op to denote two types of arrays:

  1. persistent arrays in SciDB, denoted by an array name, e.g. myNamespace.myArray.
  2. array operations, i.e. any number of potentially nested operations on one or multiple arrays, e.g. filtering one array, or joining two arrays and then filtering the result.

Semantically, both types of arrays are the same, and support the same set of verbs. They also both have:

  1. one or more attributes
  2. zero or more dimensions

Regular arrays have at least one explicit dimension. SciDB data frames have no explicit dimension; in SciDB versions through 22.5 there are two hidden dimensions whose names start with a dollar sign ($), while in SciDB 23.10 and onwards there are simply no dimensions at all.

# Upload a data frame to scidb
arr_uploaded = conn$upload_df(CO2, name = "test_CO2") # datasets::CO2

# Or just create a new (empty) array from a schema string
arr_created = conn$create_array(
  name = "test_CO2_empty",
  schema_template = "<Plant:string,Type:string,Treatment:string,conc:double,uptake:double> [i]"
)
# Create a scidb data frame. Notice no dimension provided
arr_created_df = conn$create_array(
  name = "test_CO2_empty_df",
  schema_template = "<Plant:string,Type:string,Treatment:string,conc:double,uptake:double>" # leave out any dimension
)

# Retrieve an existing array (the one we just uploaded)
arr_existing = conn$array("test_CO2")

# Compile data.frame into a scidb 'build' operation with a build dimension
arr_compiled = conn$compile_df(head(CO2, 4))
# Compile to a scidb data frame without explicit dimensions
# (spell out TRUE rather than the reassignable shorthand T)
arr_compiled_df = conn$compile_df(head(CO2, 4), as_scidb_data_frame = TRUE)

# Create a transient array by AFL
arr_from_afl = conn$afl_expr(arr_existing$filter(conc > 800)$to_afl())

# Store an AFL statement into a persistent array
arr_stored = conn$afl_expr(
    # afl_str = "filter(test_CO2, conc > 800)" # equivalent to code below
    arr_existing$filter(conc > 800)$to_afl()
  )$persist(
    save_array_name = "test_CO2_stored", .gc = FALSE # prevent removal by R's garbage collector
  )

Let’s inspect these arrays!

arr_uploaded$to_afl() # the AFL of a persistent array is just its name
#> [1] "test_CO2"
arr_uploaded$to_schema_str()
#> [1] "<Plant:string,Type:string,Treatment:string,conc:double,uptake:double> [i=1:84:0:10000]"
arr_uploaded$is_persistent() # test if the afl is an array name, without checking in db
#> [1] TRUE
arr_uploaded$exists_persistent_array() # whether the persistent array exists in db (does check in db)
#> [1] TRUE
arr_uploaded$array_meta_data() # catalog metadata: ids, schema, namespace, distribution
#>       name uaid aid
#> 1 test_CO2    1   2
#>                                                                                           schema
#> 1 test_CO2<Plant:string,Type:string,Treatment:string,conc:double,uptake:double> [i=1:84:0:10000]
#>   availability temporary namespace distribution etcomp
#> 1         TRUE     FALSE    public       hashed   none
arr_uploaded$summarize_array() # storage stats: cell, byte and chunk counts
#>   inst attid att count bytes chunks min_count avg_count max_count min_bytes
#> 1    0     0 all    84  2135      6        84        84        84        48
#>   avg_bytes max_bytes
#> 1  355.8333       752
arr_uploaded$cell_count() # number of non-empty cells
#> [1] 84

str(arr_created) # created empty, with an unbounded dimension
#> [1] "test_CO2_empty <Plant:string,Type:string,Treatment:string,conc:double,uptake:double> [i=0:*:0:*]"
str(arr_existing) # same as arr_uploaded
#> [1] "test_CO2 <Plant:string,Type:string,Treatment:string,conc:double,uptake:double> [i=1:84:0:10000]"

arr_compiled$to_afl() # a `build` literal with a randomly named build dimension
#> [1] "build(<Plant:string,Type:string,Treatment:string,conc:double,uptake:double>[sWLkluueDl_], '[(\\'Qn1\\',\\'Quebec\\',\\'nonchilled\\', 95,16.0),(\\'Qn1\\',\\'Quebec\\',\\'nonchilled\\',175,30.4),(\\'Qn1\\',\\'Quebec\\',\\'nonchilled\\',250,34.8),(\\'Qn1\\',\\'Quebec\\',\\'nonchilled\\',350,37.2)]', true)"
arr_compiled$to_schema_str()
#> [1] "<Plant:string,Type:string,Treatment:string,conc:double,uptake:double> [sWLkluueDl_=0:*:0:1000000]"
arr_compiled$is_persistent() # a transient expression, not a named array
#> [1] FALSE

# SciDB data frames
arr_created_df$to_schema_str() # no dimension part in the schema
#> [1] "<Plant:string,Type:string,Treatment:string,conc:double,uptake:double>"
arr_compiled_df$to_schema_str()
#> [1] "<Plant:string,Type:string,Treatment:string,conc:double,uptake:double>"

arr_from_afl$to_afl()
#> [1] "filter(test_CO2,conc > 800)"
arr_from_afl$cell_count()
#> [1] 12

str(arr_stored)
#> [1] "test_CO2_stored <Plant:string,Type:string,Treatment:string,conc:double,uptake:double> [i=1:84:0:10000]"
arr_stored$cell_count()
#> [1] 12

Download array_op as data.frame

Call to_df method on any array_op instance to get a data frame with all its attributes; or to_df_all to get all fields (both dimensions and attributes).

arr_stored$to_df() # attributes only
#>    Plant        Type  Treatment conc uptake
#> 1    Qn1      Quebec nonchilled 1000   39.7
#> 2    Qn2      Quebec nonchilled 1000   44.3
#> 3    Qn3      Quebec nonchilled 1000   45.5
#> 4    Qc1      Quebec    chilled 1000   38.7
#> 5    Qc2      Quebec    chilled 1000   42.4
#> 6    Qc3      Quebec    chilled 1000   41.4
#> 7    Mn1 Mississippi nonchilled 1000   35.5
#> 8    Mn2 Mississippi nonchilled 1000   31.5
#> 9    Mn3 Mississippi nonchilled 1000   27.8
#> 10   Mc1 Mississippi    chilled 1000   21.9
#> 11   Mc2 Mississippi    chilled 1000   14.4
#> 12   Mc3 Mississippi    chilled 1000   19.9
arr_stored$to_df_all() # Notice the dimension column [i]
#>     i Plant        Type  Treatment conc uptake
#> 1   7   Qn1      Quebec nonchilled 1000   39.7
#> 2  14   Qn2      Quebec nonchilled 1000   44.3
#> 3  21   Qn3      Quebec nonchilled 1000   45.5
#> 4  28   Qc1      Quebec    chilled 1000   38.7
#> 5  35   Qc2      Quebec    chilled 1000   42.4
#> 6  42   Qc3      Quebec    chilled 1000   41.4
#> 7  49   Mn1 Mississippi nonchilled 1000   35.5
#> 8  56   Mn2 Mississippi nonchilled 1000   31.5
#> 9  63   Mn3 Mississippi nonchilled 1000   27.8
#> 10 70   Mc1 Mississippi    chilled 1000   21.9
#> 11 77   Mc2 Mississippi    chilled 1000   14.4
#> 12 84   Mc3 Mississippi    chilled 1000   19.9

Common array operations

We can create array_ops on the fly and download them as data frames. Each array operation also returns an array_op instance. So we can chain up the operations in a dplyr like data flow. All the array operation verbs are named following dplyr convention where appropriate, e.g. filter, inner_join, mutate, group_by and summarize etc.

Filter

# Comma separated expressions are joined with logical AND, dplyr-style
arr_existing$filter(conc > 80, uptake > 40)$to_df()
#>    Plant   Type  Treatment conc uptake
#> 1    Qn2 Quebec nonchilled  350   41.8
#> 2    Qn2 Quebec nonchilled  500   40.6
#> 3    Qn2 Quebec nonchilled  675   41.4
#> 4    Qn2 Quebec nonchilled 1000   44.3
#> 5    Qn3 Quebec nonchilled  250   40.3
#> 6    Qn3 Quebec nonchilled  350   42.1
#> 7    Qn3 Quebec nonchilled  500   42.9
#> 8    Qn3 Quebec nonchilled  675   43.9
#> 9    Qn3 Quebec nonchilled 1000   45.5
#> 10   Qc2 Quebec    chilled 1000   42.4
#> 11   Qc3 Quebec    chilled 1000   41.4
# Equivalent to above: a single expression with `&`
# arr_existing$filter(conc > 80 & uptake > 40)$to_df()

arr_existing$filter(conc <= 100 | conc > 900)$to_df() # logical OR
#>    Plant        Type  Treatment conc uptake
#> 1    Qn1      Quebec nonchilled   95   16.0
#> 2    Qn1      Quebec nonchilled 1000   39.7
#> 3    Qn2      Quebec nonchilled   95   13.6
#> 4    Qn2      Quebec nonchilled 1000   44.3
#> 5    Qn3      Quebec nonchilled   95   16.2
#> 6    Qn3      Quebec nonchilled 1000   45.5
#> 7    Qc1      Quebec    chilled   95   14.2
#> 8    Qc1      Quebec    chilled 1000   38.7
#> 9    Qc2      Quebec    chilled   95    9.3
#> 10   Qc2      Quebec    chilled 1000   42.4
#> 11   Qc3      Quebec    chilled   95   15.1
#> 12   Qc3      Quebec    chilled 1000   41.4
#> 13   Mn1 Mississippi nonchilled   95   10.6
#> 14   Mn1 Mississippi nonchilled 1000   35.5
#> 15   Mn2 Mississippi nonchilled   95   12.0
#> 16   Mn2 Mississippi nonchilled 1000   31.5
#> 17   Mn3 Mississippi nonchilled   95   11.3
#> 18   Mn3 Mississippi nonchilled 1000   27.8
#> 19   Mc1 Mississippi    chilled   95   10.5
#> 20   Mc1 Mississippi    chilled 1000   21.9
#> 21   Mc2 Mississippi    chilled   95    7.7
#> 22   Mc2 Mississippi    chilled 1000   14.4
#> 23   Mc3 Mississippi    chilled   95   10.6
#> 24   Mc3 Mississippi    chilled 1000   19.9

# Filter on string fields
# %like% takes regex as the right operand; all other string verbs match literally.
arr_existing$filter(Plant %contains% "3", Plant %starts_with% "Q", Treatment %contains% "chill", Treatment %like% "chill[a-z]{2}$")$to_df()
#>   Plant   Type Treatment conc uptake
#> 1   Qc3 Quebec   chilled   95   15.1
#> 2   Qc3 Quebec   chilled  175   21.0
#> 3   Qc3 Quebec   chilled  250   38.1
#> 4   Qc3 Quebec   chilled  350   34.0
#> 5   Qc3 Quebec   chilled  500   38.9
#> 6   Qc3 Quebec   chilled  675   39.6
#> 7   Qc3 Quebec   chilled 1000   41.4

To use a variable in a filter expression, prepend !!, as shown below

min_conc <- 80 # a plain R variable, not an array field
arr_existing$filter(conc > !!min_conc, uptake > 40)$to_df() # !! injects its value into the expression
#>    Plant   Type  Treatment conc uptake
#> 1    Qn2 Quebec nonchilled  350   41.8
#> 2    Qn2 Quebec nonchilled  500   40.6
#> 3    Qn2 Quebec nonchilled  675   41.4
#> 4    Qn2 Quebec nonchilled 1000   44.3
#> 5    Qn3 Quebec nonchilled  250   40.3
#> 6    Qn3 Quebec nonchilled  350   42.1
#> 7    Qn3 Quebec nonchilled  500   42.9
#> 8    Qn3 Quebec nonchilled  675   43.9
#> 9    Qn3 Quebec nonchilled 1000   45.5
#> 10   Qc2 Quebec    chilled 1000   42.4
#> 11   Qc3 Quebec    chilled 1000   41.4

Mutate/Transmute

By expressions

As shown below, field names can be either quoted or not quoted. But the field expression must be a single string. Unlike R, SciDB uses single quotes for string literals. E.g. 'at_' below is single-quoted.

Like filter expressions, use !! to reference variables in memory.

# Modify existing fields; the generated AFL applies the new values, then projects them back in place
result = arr_stored$mutate(Type = 'at_' + Type, conc = conc/1000)
result$to_afl()
#> [1] "project(project(apply(project(apply(test_CO2_stored,_Type,'at_' + Type,_conc,conc / 1000),Type,conc,inverse:true),Type,_Type,conc,_conc),_Type,_conc,inverse:true),Plant,Type,Treatment,conc,uptake)"
result$to_df()
#>    Plant           Type  Treatment conc uptake
#> 1    Qn1      at_Quebec nonchilled    1   39.7
#> 2    Qn2      at_Quebec nonchilled    1   44.3
#> 3    Qn3      at_Quebec nonchilled    1   45.5
#> 4    Qc1      at_Quebec    chilled    1   38.7
#> 5    Qc2      at_Quebec    chilled    1   42.4
#> 6    Qc3      at_Quebec    chilled    1   41.4
#> 7    Mn1 at_Mississippi nonchilled    1   35.5
#> 8    Mn2 at_Mississippi nonchilled    1   31.5
#> 9    Mn3 at_Mississippi nonchilled    1   27.8
#> 10   Mc1 at_Mississippi    chilled    1   21.9
#> 11   Mc2 at_Mississippi    chilled    1   14.4
#> 12   Mc3 at_Mississippi    chilled    1   19.9

# Add new fields
result = arr_stored$mutate(Plant_Type = Plant + '-' + Type, feel = if(Treatment == 'chilled') 'cold' else 'ok')

# Same as above. Notice the `iif` operator is written explicitly here;
# the native R `if`/`else` expression above compiles to it.
# result = arr_stored$mutate(Plant_Type = Plant + '-' + Type, feel = iif(Treatment == 'chilled', 'cold', 'ok'))
result$to_afl()
#> [1] "apply(test_CO2_stored,Plant_Type,(Plant + '-') + Type,feel,iif(Treatment = 'chilled', 'cold', 'ok'))"
result$to_df()
#>    Plant        Type  Treatment conc uptake      Plant_Type feel
#> 1    Qn1      Quebec nonchilled 1000   39.7      Qn1-Quebec   ok
#> 2    Qn2      Quebec nonchilled 1000   44.3      Qn2-Quebec   ok
#> 3    Qn3      Quebec nonchilled 1000   45.5      Qn3-Quebec   ok
#> 4    Qc1      Quebec    chilled 1000   38.7      Qc1-Quebec cold
#> 5    Qc2      Quebec    chilled 1000   42.4      Qc2-Quebec cold
#> 6    Qc3      Quebec    chilled 1000   41.4      Qc3-Quebec cold
#> 7    Mn1 Mississippi nonchilled 1000   35.5 Mn1-Mississippi   ok
#> 8    Mn2 Mississippi nonchilled 1000   31.5 Mn2-Mississippi   ok
#> 9    Mn3 Mississippi nonchilled 1000   27.8 Mn3-Mississippi   ok
#> 10   Mc1 Mississippi    chilled 1000   21.9 Mc1-Mississippi cold
#> 11   Mc2 Mississippi    chilled 1000   14.4 Mc2-Mississippi cold
#> 12   Mc3 Mississippi    chilled 1000   19.9 Mc3-Mississippi cold

mutate operation keeps the original fields unless we modify them.

transmute allows us to explicitly specify which fields to keep.

# Strings are escaped for AFL: quotes inside the expressions survive into the query
result = arr_stored$transmute(Plant, Plant_Type = Plant + "'-'" + Type, feel = iif(Treatment == 'chilled', 'c"ooo"ld', "OK"))
result$to_afl()
#> [1] "project(apply(test_CO2_stored,Plant_Type,(Plant + '\\'-\\'') + Type,feel,iif(Treatment = 'chilled', 'c\"ooo\"ld', 'OK')),Plant,Plant_Type,feel)"
result$to_df()
#>    Plant        Plant_Type     feel
#> 1    Qn1      Qn1'-'Quebec       OK
#> 2    Qn2      Qn2'-'Quebec       OK
#> 3    Qn3      Qn3'-'Quebec       OK
#> 4    Qc1      Qc1'-'Quebec c"ooo"ld
#> 5    Qc2      Qc2'-'Quebec c"ooo"ld
#> 6    Qc3      Qc3'-'Quebec c"ooo"ld
#> 7    Mn1 Mn1'-'Mississippi       OK
#> 8    Mn2 Mn2'-'Mississippi       OK
#> 9    Mn3 Mn3'-'Mississippi       OK
#> 10   Mc1 Mc1'-'Mississippi c"ooo"ld
#> 11   Mc2 Mc2'-'Mississippi c"ooo"ld
#> 12   Mc3 Mc3'-'Mississippi c"ooo"ld

By arrays

mutate and transmute are useful if the mutated fields are in expressions. Another common use case is mutating fields of an existing array by another array, rather than by expressions.

# Mutate `Treatment` of matching cells using values from another array, matched on `Plant`
dataArray = conn$array_from_df(data.frame(Plant = c("Qn1", "Qn2", "Qn3"), Treatment = "treated-mutated"))
result = arr_stored$mutate_by(dataArray, keys = "Plant", updated_fields = "Treatment")
result$to_afl()
#> [1] "project(join(project(redimension(equi_join(project(build(<Plant:string,Treatment:string>[IEEX_], '[(\\'Qn1\\',\\'treated-mutated\\'),(\\'Qn2\\',\\'treated-mutated\\'),(\\'Qn3\\',\\'treated-mutated\\')]', true),Treatment,Plant) as _L, project(apply(test_CO2_stored,i,i),i,Plant) as _R, left_names:_L.Plant, right_names:_R.Plant),<Treatment:string> [i=1:84:0:10000]),Treatment),project(test_CO2_stored,Plant,Type,conc,uptake)),Plant,Type,Treatment,conc,uptake)"
result$to_df()
#>   Plant   Type       Treatment conc uptake
#> 1   Qn1 Quebec treated-mutated 1000   39.7
#> 2   Qn2 Quebec treated-mutated 1000   44.3
#> 3   Qn3 Quebec treated-mutated 1000   45.5

Array Mutability

NOTE: the source arrays remain the same in examples above. All mutate operations just created mutated data copies of the source.

To modify array content, we can update or overwrite the source array with mutated data copies.

# Build the same mutated copy as in the previous section
dataArray = conn$array_from_df(data.frame(Plant = c("Qn1", "Qn2", "Qn3"), Treatment = "treated-mutated"))
mutated = arr_stored$mutate_by(dataArray, keys = "Plant", updated_fields = "Treatment")

# Persist data changes to an existing array
mutated$update(arr_stored)$execute()
arr_stored$to_df() # Only matched data cells are updated
#>    Plant        Type       Treatment conc uptake
#> 1    Qn1      Quebec treated-mutated 1000   39.7
#> 2    Qn2      Quebec treated-mutated 1000   44.3
#> 3    Qn3      Quebec treated-mutated 1000   45.5
#> 4    Qc1      Quebec         chilled 1000   38.7
#> 5    Qc2      Quebec         chilled 1000   42.4
#> 6    Qc3      Quebec         chilled 1000   41.4
#> 7    Mn1 Mississippi      nonchilled 1000   35.5
#> 8    Mn2 Mississippi      nonchilled 1000   31.5
#> 9    Mn3 Mississippi      nonchilled 1000   27.8
#> 10   Mc1 Mississippi         chilled 1000   21.9
#> 11   Mc2 Mississippi         chilled 1000   14.4
#> 12   Mc3 Mississippi         chilled 1000   19.9

mutated$overwrite(arr_stored)$execute() # replaces ALL existing content with `mutated`
arr_stored$to_df() # Caution: Overwrite all array content!!!
#>   Plant   Type       Treatment conc uptake
#> 1   Qn1 Quebec treated-mutated 1000   39.7
#> 2   Qn2 Quebec treated-mutated 1000   44.3
#> 3   Qn3 Quebec treated-mutated 1000   45.5

Join

Four joins are supported: inner_join, left_join, right_join, and full_join.

First, let’s populate two arrays.

# `db` and `fa` appear in both data frames and will become the inferred join keys
leftDf = data.frame(lda = 1:5, db = 11:15, fa = LETTERS[1:5],  lfb = 3.14 * 1:5)
rightDf = data.frame(rda = 3:10, db = 13:20, fa = LETTERS[3:10], rfb = 3.14 * 3:10)

leftDf
#>   lda db fa   lfb
#> 1   1 11  A  3.14
#> 2   2 12  B  6.28
#> 3   3 13  C  9.42
#> 4   4 14  D 12.56
#> 5   5 15  E 15.70
rightDf
#>   rda db fa   rfb
#> 1   3 13  C  9.42
#> 2   4 14  D 12.56
#> 3   5 15  E 15.70
#> 4   6 16  F 18.84
#> 5   7 17  G 21.98
#> 6   8 18  H 25.12
#> 7   9 19  I 28.26
#> 8  10 20  J 31.40

# At least one input to 'cross_join' must have a specified chunk size,
# so persist both arrays (TRUE/FALSE spelled out; T/F are reassignable shorthands).
L = conn$array_from_df(leftDf, "<fa:string, lfb:double> [lda;db] ", force_template_schema = TRUE)$persist(.gc = FALSE)
R = conn$array_from_df(rightDf, "<fa:string, rfb:double> [rda;db] ", force_template_schema = TRUE)$persist(.gc = FALSE)

Then, start with a vanilla join where join keys are inferred from array fields. In this case, fa and db are join keys since they exist in both arrays.

L$inner_join(R)$to_df() # keeps only rows matched on both inferred keys (fa, db)
#>   lda db fa   lfb rda   rfb
#> 1   3 13  C  9.42   3  9.42
#> 2   4 14  D 12.56   4 12.56
#> 3   5 15  E 15.70   5 15.70
L$left_join(R)$to_df() # unmatched left rows get NA for the right-hand fields
#>   lda db fa   lfb rda   rfb
#> 1   3 13  C  9.42   3  9.42
#> 2   1 11  A  3.14  NA    NA
#> 3   4 14  D 12.56   4 12.56
#> 4   5 15  E 15.70   5 15.70
#> 5   2 12  B  6.28  NA    NA
L$right_join(R)$to_df() # unmatched right rows get NA for the left-hand fields
#>   lda db fa   lfb rda   rfb
#> 1  NA 18  H    NA   8 25.12
#> 2   3 13  C  9.42   3  9.42
#> 3  NA 20  J    NA  10 31.40
#> 4   4 14  D 12.56   4 12.56
#> 5  NA 16  F    NA   6 18.84
#> 6  NA 17  G    NA   7 21.98
#> 7  NA 19  I    NA   9 28.26
#> 8   5 15  E 15.70   5 15.70
L$full_join(R)$to_df() # all rows from both sides, NA where unmatched
#>    lda db fa   lfb rda   rfb
#> 1    1 11  A  3.14  NA    NA
#> 2    3 13  C  9.42   3  9.42
#> 3   NA 18  H    NA   8 25.12
#> 4    4 14  D 12.56   4 12.56
#> 5   NA 16  F    NA   6 18.84
#> 6   NA 17  G    NA   7 21.98
#> 7   NA 20  J    NA  10 31.40
#> 8    5 15  E 15.70   5 15.70
#> 9   NA 19  I    NA   9 28.26
#> 10   2 12  B  6.28  NA    NA

We can specify which fields to keep with array_op$select. By default, all fields are selected, except that the join keys only exist once in the result even if they are selected on both operands.

# Keep only `lfb` and `lda` from the left operand
L$select("lfb", "lda")$
  inner_join(R)$to_df()
#>     lfb lda rda   rfb
#> 1  9.42   3   3  9.42
#> 2 12.56   4   4 12.56
#> 3 15.70   5   5 15.70

L$inner_join(R$select("rda", "rfb"))$to_df() # restrict the right operand's fields instead
#>   lda db fa   lfb rda   rfb
#> 1   3 13  C  9.42   3  9.42
#> 2   4 14  D 12.56   4 12.56
#> 3   5 15  E 15.70   5 15.70

L$select("lfb", "db")$inner_join(R$select("rfb"))$to_df() # the join key `db` appears once in the result
#>     lfb db   rfb
#> 1  9.42 13  9.42
#> 2 12.56 14 12.56
#> 3 15.70 15 15.70

We can specify the join keys and aliases explicitly. Array aliases are important to disambiguate overlapping fields.

# `db` as the only join key; `fa` exists in both arrays and is suffixed with the array aliases in the result
L$left_join(R, by = c('db'), left_alias = "_LL", right_alias = "_RR")$to_df()
#>   lda db fa_LL   lfb rda fa_RR   rfb
#> 1   5 15     E 15.70   5     E 15.70
#> 2   2 12     B  6.28  NA  <NA>    NA
#> 3   3 13     C  9.42   3     C  9.42
#> 4   1 11     A  3.14  NA  <NA>    NA
#> 5   4 14     D 12.56   4     D 12.56

# Different join keys on each side: left `lda` matches right `rda`
L$right_join(R, by.x = "lda", by.y = "rda", left_alias = "_LL", right_alias = "_RR")$to_df()
#>   lda db_LL fa_LL   lfb db_RR fa_RR   rfb
#> 1   3    13     C  9.42    13     C  9.42
#> 2   7    NA  <NA>    NA    17     G 21.98
#> 3   8    NA  <NA>    NA    18     H 25.12
#> 4   6    NA  <NA>    NA    16     F 18.84
#> 5  10    NA  <NA>    NA    20     J 31.40
#> 6   4    14     D 12.56    14     D 12.56
#> 7   9    NA  <NA>    NA    19     I 28.26
#> 8   5    15     E 15.70    15     E 15.70

Join mode

By default, joins are implemented by the SciDB equi_join operator, which works both when the join keys are array dimensions and when they are attributes, or a mix of the two. However, the drawback of equi_join is that it materializes its results, which can be undesirable in a complex nested join.

Two other join modes which avoid materialization, apply_join and cross_join, are available in some cases but have other restrictions.

apply_join mode

The apply_join mode is available for inner_join and left_join; it is not an option for the right_join operator but could be used by reordering the inputs to express the right outer join as a left outer join. Like equi_join, the join keys for apply_join may be a mix of array dimensions and attributes. Unlike equi_join, it does not materialize its results, and the returned array uses the same dimensions as the input left-hand array, and so it has the same schema as the left-hand array up to the addition of new attributes (or, in the case of left outer joins, adding nullable flags to the current attributes). However, it does so at the cost of replicating the right-hand array to all the SciDB instances, and so is inappropriate for very large right-hand inputs.

# apply_join keeps the left-hand dimensions, so use to_df_all() to include them
L$inner_join(R, join_mode='apply_join')$to_df_all()
#>   lda db fa   lfb rda   rfb
#> 1   3 13  C  9.42   3  9.42
#> 2   4 14  D 12.56   4 12.56
#> 3   5 15  E 15.70   5 15.70
L$left_join(R, join_mode='apply_join')$to_df_all()
#>   lda db fa   lfb rda   rfb
#> 1   1 11  A  3.14  NA    NA
#> 2   2 12  B  6.28  NA    NA
#> 3   3 13  C  9.42   3  9.42
#> 4   4 14  D 12.56   4 12.56
#> 5   5 15  E 15.70   5 15.70

Note that, unlike equi_join, the dimensions of the left-hand input do not become attributes of the output, and so they need to be explicitly selected to be returned when using to_df() and not to_df_all(), as shown above. A second restriction is that apply_join has non-standard behavior for a join when there are multiple matches for the keys on the right-hand input. Because the output uses the dimensions of the left-hand input, it cannot contain more than one record per original left-hand record; so, when there are multiple matches from the right-hand side, only one is chosen arbitrarily to populate the joined attributes of the output.

cross_join mode

The cross_join mode is available for inner_join, but no other join. This mode does not materialize results, but is valid only when the join keys are dimensions for both the left-hand and right-hand arrays.

# cross_join requires all join keys to be dimensions on both sides
L$inner_join(R, 
             by.x = c('db', 'lda'), 
             by.y = c('db', 'rda'), 
             join_mode = 'cross_join', left_alias = "_LL", right_alias = "_RR")$
  to_df()
#>   lda db fa_LL   lfb fa_RR   rfb
#> 1   3 13     C  9.42     C  9.42
#> 2   4 14     D 12.56     D 12.56
#> 3   5 15     E 15.70     E 15.70

When the join keys are a strict subset of the dimensions of the right-hand array, the output contains, as expected, the Cartesian product of the matched records.

Custom settings

# Pass extra `equi_join` operator settings through verbatim
result = L$inner_join(R, 
             settings = list(
               algorithm = "'hash_replicate_right'", # string setting item
               hash_join_threshold = 1024 # numeric setting item
         ))
result$to_afl()
#> [1] "equi_join(project(apply(Rarrayop_375733fbcfb3,lda,lda),lda,fa,lfb) as _L, project(apply(Rarrayop_3757609a5129,rda,rda),rda,fa,rfb) as _R, left_names:(_L.db,_L.fa), right_names:(_R.db,_R.fa), algorithm:'hash_replicate_right', hash_join_threshold:1024)"
result$to_df()
#>   lda db fa   lfb rda   rfb
#> 1   3 13  C  9.42   3  9.42
#> 2   4 14  D 12.56   4 12.56
#> 3   5 15  E 15.70   5 15.70

Populate fields automatically

In many SQL dialects, we can define an auto incremented field. When new records are inserted, the field is auto incremented. SciDB currently does not support this feature.

Given a data source array_op (normally created from R data frame or conn$fread), and a target template array which we insert data into, we can achieve this feature in two steps:

  1. populate the auto incremented fields of the source array_op using stats from the template target array.
  2. insert the updated source into the target array

Let’s see an example. First, create a target array with 3 attrs and 2 dimensions; and a source array with some data. Notice we set the source array’s build_dim_spec = 'z' so that we can refer to it later.

# Target: 3 attributes and 2 dimensions; source: partial data to be auto-completed.
# TRUE/FALSE spelled out (T/F are reassignable shorthands).
AutoFieldsContent = data.frame(da=1:10, db=1:10, f_str = letters[1:10], f_int32=1:10, f_int64=11:20)
target = conn$array_from_df(AutoFieldsContent,
                            template = "<f_str:string, f_int32:int32, f_int64:int64> [da=0:*:0:1000; db=0:*:0:1000]",
                            force_template_schema = TRUE)$
              persist(.temp = TRUE, .gc = FALSE)
empty_target = conn$create_array(dbutils$random_array_name(), target, .temp = TRUE)
source = conn$compile_df(data.frame(f_str = LETTERS[1:3]), template = target, build_dim_spec = "z")
source$set_auto_fields(target)$to_df() # same as below
#>   f_str da db f_int32 f_int64
#> 1     A 11 11      11      21
#> 2     B 12 12      12      22
#> 3     C 13 13      13      23
# source$set_auto_fields(target, source_auto_increment=c(z=0), target_auto_increment=c(da=0, db=0, f_int32=0))$to_df()

# target_auto_increment does not matter if the target array is not empty:
# the next values continue from the target's current maxima
source$set_auto_fields(target, source_auto_increment=c(z=0), target_auto_increment=c(da=1, db=2, f_int32=3))$to_df()
#>   f_str da db f_int32
#> 1     A 11 11      11
#> 2     B 12 12      12
#> 3     C 13 13      13

# target_auto_increment supplies the initial field values if the target array is empty
source$set_auto_fields(empty_target, source_auto_increment=c(z=0), target_auto_increment=c(da=1, db=2, f_int32=3))$to_df()
#>   f_str da db f_int32
#> 1     A  1  2       3
#> 2     B  2  3       4
#> 3     C  3  4       5

# By default, both source and target auto increment start from 0.
# The only dimension in source is used as the source_auto_increment field; otherwise, we need to specify the source field.
# All the missing fields are assumed to be target_auto_increment fields unless provided otherwise.
source$set_auto_fields(empty_target)$to_df()
#>   f_str da db f_int32 f_int64
#> 1     A  0  0       0       0
#> 2     B  1  1       1       1
#> 3     C  2  2       2       2
source$set_auto_fields(empty_target, target_auto_increment = c("da", "db"))$to_df() # populate only da and db
#>   f_str da db
#> 1     A  0  0
#> 2     B  1  1
#> 3     C  2  2

In a less common case, we may need to populate a dimension field to avoid cell coordinate collisions.

AutoFieldsContent # the rows already stored in `target`, for reference
#>    da db f_str f_int32 f_int64
#> 1   1  1     a       1      11
#> 2   2  2     b       2      12
#> 3   3  3     c       3      13
#> 4   4  4     d       4      14
#> 5   5  5     e       5      15
#> 6   6  6     f       6      16
#> 7   7  7     g       7      17
#> 8   8  8     h       8      18
#> 9   9  9     i       9      19
#> 10 10 10     j      10      20
# The `da` values collide with rows already in `target`; auto-populate `db`
# so that each (da, db) coordinate pair is unique
conn$
    compile_df(data.frame(f_str = LETTERS[4:10], f_int32 = 4:10, da = c(1,2,1,2,1,4,5)), template = target)$
    set_auto_fields(target,
                    anti_collision_field = 'db')$
    to_df() |> dplyr::arrange(da, db)
#>   da db f_str f_int32
#> 1  1  2     D       4
#> 2  1  3     F       6
#> 3  1  4     H       8
#> 4  2  3     E       5
#> 5  2  4     G       7
#> 6  4  5     I       9
#> 7  5  6     J      10
# Against the empty target, `db` simply counts up from 0 within each `da`
conn$
    compile_df(data.frame(f_str = LETTERS[4:10], f_int32 = 4:10, da = c(1,2,1,2,1,4,5)), template = empty_target)$
    set_auto_fields(empty_target,
                    anti_collision_field = 'db')$
    to_df() |> dplyr::arrange(da, db)
#>   da db f_str f_int32
#> 1  1  0     D       4
#> 2  1  1     F       6
#> 3  1  2     H       8
#> 4  2  0     E       5
#> 5  2  1     G       7
#> 6  4  0     I       9
#> 7  5  0     J      10

Notice there are no duplicated tuples of (da, db) so they can be safely inserted into the target array without coordinate collisions.

We can combine the two types of auto fields in one run and update the target array.

# Populate both auto-increment (z -> f_int64) and anti-collision (db) fields,
# then insert the result into `target`
conn$
    compile_df(data.frame(f_str = LETTERS[4:10], da = c(1,2,1,2,1,4,5)), template = target, build_dim_spec = "z")$
    set_auto_fields(target,
                    source_auto_increment = "z",
                    target_auto_increment = "f_int64",
                    anti_collision_field = 'db')$
    mutate(f_int32 = int32(null))$
    change_schema(target)$
    update(target)$
    execute()

target$to_df_all() # old rows intact; new rows get fresh db values and continued f_int64
#>    da db f_str f_int32 f_int64
#> 1   1  1     a       1      11
#> 2   1  2     D      NA      21
#> 3   1  3     F      NA      23
#> 4   1  4     H      NA      25
#> 5   2  2     b       2      12
#> 6   2  3     E      NA      22
#> 7   2  4     G      NA      24
#> 8   3  3     c       3      13
#> 9   4  4     d       4      14
#> 10  4  5     I      NA      26
#> 11  5  5     e       5      15
#> 12  5  6     J      NA      27
#> 13  6  6     f       6      16
#> 14  7  7     g       7      17
#> 15  8  8     h       8      18
#> 16  9  9     i       9      19
#> 17 10 10     j      10      20
# The same combined run, inserting into the empty target
conn$
    compile_df(data.frame(f_str = LETTERS[4:10], da = c(1,2,1,2,1,4,5)), template = target, build_dim_spec = "z")$
    set_auto_fields(empty_target,
                    source_auto_increment = "z",
                    target_auto_increment = "f_int64",
                    anti_collision_field = 'db')$
    mutate(f_int32 = int32(null))$
    change_schema(empty_target)$
    update(empty_target)$
    execute()

empty_target$to_df_all()
#>   da db f_str f_int32 f_int64
#> 1  1  0     D      NA       0
#> 2  1  1     F      NA       2
#> 3  1  2     H      NA       4
#> 4  2  0     E      NA       1
#> 5  2  1     G      NA       3
#> 6  4  0     I      NA       5
#> 7  5  0     J      NA       6
# Clean up: drop both temporary arrays; `try` keeps the script going
# if either is already gone (TRUE spelled out; T is reassignable)
try({
  target$remove_array()
  empty_target$remove_array()
}, silent = TRUE)

Aggregation

Group by and then Summarize

# Tip: Call `dbutils$db_aggregates()$to_df()` to see supported aggregate functions
arr_existing$
  group_by("Plant", "Type")$
  summarize(count = count(),
            uptake_sd = stdev(uptake),
            avg(conc), # unnamed fields are auto-named by SciDB, e.g. conc_avg
            max(uptake))$
  to_df()
#>    Plant        Type count uptake_sd conc_avg uptake_max
#> 1    Qn1      Quebec     7  8.214766      435       39.7
#> 2    Mc1 Mississippi     7  4.118657      435       22.2
#> 3    Qc1      Quebec     7  8.334609      435       38.7
#> 4    Qn3      Quebec     7 10.349948      435       45.5
#> 5    Mc2 Mississippi     7  2.186974      435       14.4
#> 6    Qn2      Quebec     7 11.004069      435       44.3
#> 7    Mn1 Mississippi     7  8.694251      435       35.5
#> 8    Mc3 Mississippi     7  3.049044      435       19.9
#> 9    Mn3 Mississippi     7  6.484707      435       28.5
#> 10   Qc3      Quebec     7 10.321083      435       41.4
#> 11   Qc2      Quebec     7 11.336960      435       42.4
#> 12   Mn2 Mississippi     7  7.652855      435       32.4

# Aggregation result is still an array_op, so we can keep chaining verbs.
arr_existing$
  group_by("Plant", "Type")$
  summarize(count = count(),
            uptake_sd = stdev(uptake),
            avg(conc), 
            max(uptake))$
  filter(uptake_sd > 8)$ # keep groups whose uptake standard deviation > 8
  to_df()
#>   Plant        Type count uptake_sd conc_avg uptake_max
#> 1   Qn1      Quebec     7  8.214766      435       39.7
#> 2   Qc1      Quebec     7  8.334609      435       38.7
#> 3   Qn3      Quebec     7 10.349948      435       45.5
#> 4   Qn2      Quebec     7 11.004069      435       44.3
#> 5   Mn1 Mississippi     7  8.694251      435       35.5
#> 6   Qc3      Quebec     7 10.321083      435       41.4
#> 7   Qc2      Quebec     7 11.336960      435       42.4

Summarize without group_by fields

Sometimes we need to get stats on array fields for all cells without any group_by fields, e.g. find the max/min value of a field.

# Without group_by, summarize computes stats over every cell of the array.
# Both attributes and dimensions can be aggregated.
arr_existing$summarize(
  count(), max(conc), stdev(uptake),
  max(i) # this is a dimension 
)$to_df()
#>   count conc_max uptake_stdev i_max
#> 1    84     1000     10.81441    84

Versions

Since we just modified arr_stored several times, let's check how many versions it has so far and then delete the version history.

# Show the version history of the stored array (one row per version)
arr_stored$list_versions()
#>   version_id           timestamp
#> 1          1 2025-04-10 18:03:06
#> 2          2 2025-04-10 18:03:07
#> 3          3 2025-04-10 18:03:07

# Check how many cells in each version. Iterating over the version ids
# returned by list_versions() (instead of hard-coding 1:3) keeps the loop
# correct if the number of versions changes.
for (version_id in arr_stored$list_versions()$version_id) {
  print(sprintf("Version-%d has %d cells", version_id, arr_stored$version(version_id)$cell_count()))
}
#> [1] "Version-1 has 12 cells"
#> [1] "Version-2 has 12 cells"
#> [1] "Version-3 has 3 cells"

Remove all but the latest version.

# Keep only the latest version; earlier versions are discarded permanently
arr_stored$remove_versions()
arr_stored$list_versions()
#>   version_id           timestamp
#> 1          3 2025-04-10 18:03:07

Read Files

fread method of a connection object takes a file path or multiple file paths as input, with other params, and returns an array_op which can be further processed to modify existing arrays or join with other arrays.

If we need to read data from all file columns, use fread without a template. This behaves similarly to data.table::fread.

# File column names and types are inferred by peeking into a few lines of the file
conn$fread("/file/path", header = TRUE, sep = "\t") # default sep is "\t"

# col.names are V1, V2, ... and types are inferred.
conn$fread("/file/path", header = FALSE, nrow = 10) # infer column type using only up to first 10 rows

# File column types are inferred, but names are provided explicitly. col.names can
# differ from the actual file column names but must match the number of columns.
conn$fread("/file/path", header = TRUE, col.names = c("chrom", "pos", "ref", "alt"))
conn$fread("/file/path", header = FALSE, col.names = c("chrom", "pos", "ref", "alt")) # same as above, but the first row is read as data

More commonly, we want to load data from a file into an existing array. In this case, we provide a template (either an array_op or a schema string); only the file columns whose names match the template's fields are imported.

NOTE: template field types supersede actual file column types. If header=T, column names can still be inferred.

# Only file columns whose names match the template's fields are imported
conn$fread("/file/path", template = templateArray, header = TRUE)
# template can also be a schema string.
# When header = FALSE, we need to provide the correct col.names so they match the template fields.
conn$fread("/file/path", template = "<ref:string, alt:string> [chrom;pos]", 
           header = FALSE, col.names = c("chrom", "pos", "extra_col1", "ref", "alt", "extra_col2")) # extra_cols ignored

In the examples above, fread generates AFL where fields are coerced to the template data types. If no template is provided, we can think of the data frame read from the first nrows lines of the file as a template. But not all files are perfectly formatted. We often need to handle ill-formatted fields or missing data gracefully.

The auto_dcast param, if set to TRUE, casts all numerical fields using dcast(a1, double(null)), where double can be any numerical type and a1 is the matching field.

conn$fread("/file/path", template = myArray, header = T, auto_dcast = T)

In case special field casting is needed, we can mutate imported fields on the fly. @ is replaced with the raw imported string fields, e.g. a0, a1, a2, …

# Custom casting for `chrom`: map 'X'/'Y' to 23/24, otherwise cast to int64.
# `@` is substituted with the raw imported string field.
conn$fread("/file/path", template = myArray, header = TRUE,
           mutate_fields = list('chrom' = "iif(@='X', 23, iif(@='Y', 24, dcast(@, int64(null))))"))

# We can still dcast other numeric fields while casting `chrom` differently
conn$fread("/file/path", template = myArray, header = TRUE, 
           auto_dcast = TRUE,
           mutate_fields = list('chrom' = "iif(@='X', 23, iif(@='Y', 24, dcast(@, int64(null))))"))

Raw AFL

Hopefully arrayop captures all your common SciDB use cases. In case something is missing, we can run raw AFL statements directly on a ScidbConnection object.

To run AFL statements (as strings, i.e. length-1 R character vectors), use one of the methods of a ScidbConnection:

  1. query if we expect results (array attributes as R data frames).
  2. query_all if we need both array dimensions and attributes.
  3. execute if we do not expect results
conn$query("list('operators')")
#>                         name              library
#> 1             add_attributes                scidb
#> 2              add_instances               system
#> 3           add_user_to_role           namespaces
#> 4                  aggregate                scidb
#> 5                  aio_input accelerated_io_tools
#> 6                   aio_save accelerated_io_tools
#> 7                      apply                scidb
#> 8                 apply_join            equi_join
#> 9                 attributes                scidb
#> 10                  avg_rank                scidb
#> 11                     begin                scidb
#> 12                 bernoulli                scidb
#> 13                   between                scidb
#> 14                     build                scidb
#> 15         builtin_equi_join                scidb
#> 16 builtin_grouped_aggregate                scidb
#> 17                    cancel                scidb
#> 18                      cast                scidb
#> 19               change_user           namespaces
#> 20                    commit                scidb
#> 21                   consume                scidb
#> 22              create_array                scidb
#> 23          create_namespace           namespaces
#> 24               create_role           namespaces
#> 25               create_user           namespaces
#> 26             cross_between                scidb
#> 27                cross_join                scidb
#> 28                  cumulate                scidb
#> 29                    delete                scidb
#> 30                dimensions                scidb
#> 31            drop_namespace           namespaces
#> 32                 drop_role           namespaces
#> 33                 drop_user           namespaces
#> 34       drop_user_from_role           namespaces
#> 35                      echo                scidb
#> 36                 equi_join            equi_join
#> 37                    filter                scidb
#> 38                   flatten                scidb
#> 39         grouped_aggregate    grouped_aggregate
#> 40                      hash                scidb
#> 41                      help                scidb
#> 42              index_lookup                scidb
#> 43                     input                scidb
#> 44                    insert                scidb
#> 45                      join                scidb
#> 46                     limit                scidb
#> 47                      list                scidb
#> 48      list_array_residency               system
#> 49            list_instances               system
#> 50              load_library                scidb
#> 51               load_module                scidb
#> 52               lock_arrays                scidb
#> 53                     merge                scidb
#> 54   move_array_to_namespace           namespaces
#> 55                     parse accelerated_io_tools
#> 56                   project                scidb
#> 57                  quantile                scidb
#> 58                      rank                scidb
#> 59               redimension                scidb
#> 60                    regrid                scidb
#> 61                    remove                scidb
#> 62          remove_instances               system
#> 63           remove_versions                scidb
#> 64                    rename                scidb
#> 65                    repart                scidb
#> 66                   reshape                scidb
#> 67                  rollback                scidb
#> 68                      save                scidb
#> 69                      scan                scidb
#> 70               secure_scan                scidb
#> 71             set_namespace           namespaces
#> 72      set_role_permissions           namespaces
#> 73                      show                scidb
#> 74            show_namespace           namespaces
#> 75     show_role_permissions           namespaces
#> 76       show_roles_for_user           namespaces
#> 77                 show_user           namespaces
#> 78     show_user_permissions           namespaces
#> 79        show_users_in_role           namespaces
#> 80                     slice                scidb
#> 81                      sort                scidb
#> 82                     split accelerated_io_tools
#> 83                     store                scidb
#> 84                  subarray                scidb
#> 85                 subdelete                scidb
#> 86                substitute                scidb
#> 87                 summarize                scidb
#> 88                      sync               system
#> 89          test_clear_cache                scidb
#> 90                 transpose                scidb
#> 91                    unfold                scidb
#> 92                      uniq                scidb
#> 93            unload_library                scidb
#> 94                    unpack                scidb
#> 95      unregister_instances               system
#> 96           variable_window                scidb
#> 97                  versions                scidb
#> 98                    window                scidb
#> 99                     xgrid                scidb
# Query the SciDB server version
conn$query("op_scidbversion()")
#>   major minor patch
#> 1    23    10    14

Let’s create an array temp_array in the public namespace for later examples.

# Create a named array in the public namespace; execute() is used because no result is expected
conn$execute("create array public.temp_array <a:int32, b:string, c:bool> [z]")
conn$query("show(temp_array)") # query(...) returns result attributes only
#>                                            schema distribution etcomp
#> 1 temp_array<a:int32,b:string,c:bool> [z=0:*:0:*]       hashed   none
conn$query_all("show(temp_array)") # query_all(...) returns result dimensions and attributes
#>   i                                          schema distribution etcomp
#> 1 0 temp_array<a:int32,b:string,c:bool> [z=0:*:0:*]       hashed   none

Chain AFL operators with pipes

We can pipe multiple SciDB operators in arrayop::afl function which returns an AFL string.

Depending on the number of operands:

  • afl(a | b) is converted to b(a)
  • afl(a | b(c)) is converted to b(a, c)
# The pipe expands to AFL: limit(filter(list('operators'), library != 'scidb'), 10)
conn$query(afl(
  "'operators'" | list | filter("library != 'scidb'") | limit(10)
))
#>                name              library
#> 1     add_instances               system
#> 2  add_user_to_role           namespaces
#> 3         aio_input accelerated_io_tools
#> 4          aio_save accelerated_io_tools
#> 5        apply_join            equi_join
#> 6       change_user           namespaces
#> 7  create_namespace           namespaces
#> 8       create_role           namespaces
#> 9       create_user           namespaces
#> 10   drop_namespace           namespaces

Remove arrays

NOTE: removed arrays cannot be recovered. Always double check and back up if necessary before entering the danger zone.

try(conn$execute("remove(temp_array)"), silent = T) # manually remove

# Drop every array created in this vignette. list() (rather than c()) makes
# collecting heterogeneous array_op objects explicit; iteration is identical.
for (x in list(arr_uploaded,
               arr_created,
               arr_created_df,
               arr_existing,
               arr_compiled,
               arr_stored,
               L, R)) {
  if (x$exists_persistent_array()) { # can only remove persistent arrays
    x$remove_array()
  }
}