library(arrayop) # optional step to leave out the `arrayop::` prefix.
# First, load user name and token from an authentication file, which is also compatible with iquery.
auth = yaml::yaml.load_file("~/.scidb_auth")
# Then connect to scidb. Params host, username and token are all single strings and can be obtained in whatever way is convenient (here, from the YAML auth file)
conn = db_connect(host = "localhost", username = auth[["user-name"]], token = auth[["user-password"]])
Once we have connected to SciDB with `db_connect`, we can save the connection instance as shown above, or call `arrayop::get_default_connection()` anywhere in code, which always returns the current db connection.
identical(get_default_connection(), conn) # the exact same thing!
#> [1] TRUE
print(conn) # print out `username@host [scidb version]` if the connection is successful
#> ScidbConnection: root@127.0.0.1 [23.10.14]
We use the term `array_op` to denote two types of arrays: persistent arrays stored in the database (e.g. `myNamespace.myArray`) and transient arrays defined by AFL expressions. Semantically, both types are the same and support the same set of verbs.
Regular arrays have at least one explicit dimension. SciDB data frames have no explicit dimension; in SciDB versions through 22.5 they have two hidden dimensions whose names start with a dollar sign (`$`), while in SciDB 23.10 and onward they have no dimensions at all.
# Upload a data frame to scidb
arr_uploaded = conn$upload_df(CO2, name = "test_CO2") # datasets::CO2
# Or just create a new array
arr_created = conn$create_array(
name = "test_CO2_empty",
schema_template = "<Plant:string,Type:string,Treatment:string,conc:double,uptake:double> [i]"
)
# Create a scidb data frame. Notice no dimension provided
arr_created_df = conn$create_array(
name = "test_CO2_empty_df",
schema_template = "<Plant:string,Type:string,Treatment:string,conc:double,uptake:double>" # leave out any dimension
)
# Retrieve an existing array (the one we just uploaded)
arr_existing = conn$array("test_CO2")
# Compile data.frame into a scidb 'build' operation with a build dimension
arr_compiled = conn$compile_df(head(CO2,4))
# Compile to a scidb data frame without explicit dimensions
arr_compiled_df = conn$compile_df(head(CO2,4), as_scidb_data_frame = T)
# Create a transient array by AFL
arr_from_afl = conn$afl_expr(arr_existing$filter(conc > 800)$to_afl())
# Store an AFL statement into a persistent array
arr_stored = conn$afl_expr(
# afl_str = "filter(test_CO2, conc > 800)" # equivalent to code below
arr_existing$filter(conc > 800)$to_afl()
)$persist(
  save_array_name = "test_CO2_stored", .gc = F # prevent the array from being removed by R's garbage collector
)
Let’s inspect these arrays!
arr_uploaded$to_afl()
#> [1] "test_CO2"
arr_uploaded$to_schema_str()
#> [1] "<Plant:string,Type:string,Treatment:string,conc:double,uptake:double> [i=1:84:0:10000]"
arr_uploaded$is_persistent() # test if the afl is an array name, without checking in db
#> [1] TRUE
arr_uploaded$exists_persistent_array() # whether the persistent array exists in db (does check in db)
#> [1] TRUE
arr_uploaded$array_meta_data()
#> name uaid aid
#> 1 test_CO2 1 2
#> schema
#> 1 test_CO2<Plant:string,Type:string,Treatment:string,conc:double,uptake:double> [i=1:84:0:10000]
#> availability temporary namespace distribution etcomp
#> 1 TRUE FALSE public hashed none
arr_uploaded$summarize_array()
#> inst attid att count bytes chunks min_count avg_count max_count min_bytes
#> 1 0 0 all 84 2135 6 84 84 84 48
#> avg_bytes max_bytes
#> 1 355.8333 752
arr_uploaded$cell_count()
#> [1] 84
str(arr_created)
#> [1] "test_CO2_empty <Plant:string,Type:string,Treatment:string,conc:double,uptake:double> [i=0:*:0:*]"
str(arr_existing) # same as arr_uploaded
#> [1] "test_CO2 <Plant:string,Type:string,Treatment:string,conc:double,uptake:double> [i=1:84:0:10000]"
arr_compiled$to_afl()
#> [1] "build(<Plant:string,Type:string,Treatment:string,conc:double,uptake:double>[sWLkluueDl_], '[(\\'Qn1\\',\\'Quebec\\',\\'nonchilled\\', 95,16.0),(\\'Qn1\\',\\'Quebec\\',\\'nonchilled\\',175,30.4),(\\'Qn1\\',\\'Quebec\\',\\'nonchilled\\',250,34.8),(\\'Qn1\\',\\'Quebec\\',\\'nonchilled\\',350,37.2)]', true)"
arr_compiled$to_schema_str()
#> [1] "<Plant:string,Type:string,Treatment:string,conc:double,uptake:double> [sWLkluueDl_=0:*:0:1000000]"
arr_compiled$is_persistent()
#> [1] FALSE
# SciDB data frames
arr_created_df$to_schema_str()
#> [1] "<Plant:string,Type:string,Treatment:string,conc:double,uptake:double>"
arr_compiled_df$to_schema_str()
#> [1] "<Plant:string,Type:string,Treatment:string,conc:double,uptake:double>"
arr_from_afl$to_afl()
#> [1] "filter(test_CO2,conc > 800)"
arr_from_afl$cell_count()
#> [1] 12
str(arr_stored)
#> [1] "test_CO2_stored <Plant:string,Type:string,Treatment:string,conc:double,uptake:double> [i=1:84:0:10000]"
arr_stored$cell_count()
#> [1] 12
Call the `to_df` method on any `array_op` instance to get a data frame with all its attributes, or `to_df_all` to get all fields (both dimensions and attributes).
arr_stored$to_df()
#> Plant Type Treatment conc uptake
#> 1 Qn1 Quebec nonchilled 1000 39.7
#> 2 Qn2 Quebec nonchilled 1000 44.3
#> 3 Qn3 Quebec nonchilled 1000 45.5
#> 4 Qc1 Quebec chilled 1000 38.7
#> 5 Qc2 Quebec chilled 1000 42.4
#> 6 Qc3 Quebec chilled 1000 41.4
#> 7 Mn1 Mississippi nonchilled 1000 35.5
#> 8 Mn2 Mississippi nonchilled 1000 31.5
#> 9 Mn3 Mississippi nonchilled 1000 27.8
#> 10 Mc1 Mississippi chilled 1000 21.9
#> 11 Mc2 Mississippi chilled 1000 14.4
#> 12 Mc3 Mississippi chilled 1000 19.9
arr_stored$to_df_all() # Notice the dimension column [i]
#> i Plant Type Treatment conc uptake
#> 1 7 Qn1 Quebec nonchilled 1000 39.7
#> 2 14 Qn2 Quebec nonchilled 1000 44.3
#> 3 21 Qn3 Quebec nonchilled 1000 45.5
#> 4 28 Qc1 Quebec chilled 1000 38.7
#> 5 35 Qc2 Quebec chilled 1000 42.4
#> 6 42 Qc3 Quebec chilled 1000 41.4
#> 7 49 Mn1 Mississippi nonchilled 1000 35.5
#> 8 56 Mn2 Mississippi nonchilled 1000 31.5
#> 9 63 Mn3 Mississippi nonchilled 1000 27.8
#> 10 70 Mc1 Mississippi chilled 1000 21.9
#> 11 77 Mc2 Mississippi chilled 1000 14.4
#> 12 84 Mc3 Mississippi chilled 1000 19.9
Common array operations
We can create array_ops on the fly and download them as data frames. Each array operation returns an `array_op` instance, so we can chain operations in a dplyr-like data flow. All array operation verbs are named following the dplyr convention where appropriate, e.g. `filter`, `inner_join`, `mutate`, `group_by`, and `summarize`.
# Comma separated expressions are joined with logical AND
arr_existing$filter(conc > 80, uptake > 40)$to_df()
#> Plant Type Treatment conc uptake
#> 1 Qn2 Quebec nonchilled 350 41.8
#> 2 Qn2 Quebec nonchilled 500 40.6
#> 3 Qn2 Quebec nonchilled 675 41.4
#> 4 Qn2 Quebec nonchilled 1000 44.3
#> 5 Qn3 Quebec nonchilled 250 40.3
#> 6 Qn3 Quebec nonchilled 350 42.1
#> 7 Qn3 Quebec nonchilled 500 42.9
#> 8 Qn3 Quebec nonchilled 675 43.9
#> 9 Qn3 Quebec nonchilled 1000 45.5
#> 10 Qc2 Quebec chilled 1000 42.4
#> 11 Qc3 Quebec chilled 1000 41.4
# Equivalent to above
# arr_existing$filter(conc > 80 & uptake > 40)$to_df()
arr_existing$filter(conc <= 100 | conc > 900)$to_df() # logical OR
#> Plant Type Treatment conc uptake
#> 1 Qn1 Quebec nonchilled 95 16.0
#> 2 Qn1 Quebec nonchilled 1000 39.7
#> 3 Qn2 Quebec nonchilled 95 13.6
#> 4 Qn2 Quebec nonchilled 1000 44.3
#> 5 Qn3 Quebec nonchilled 95 16.2
#> 6 Qn3 Quebec nonchilled 1000 45.5
#> 7 Qc1 Quebec chilled 95 14.2
#> 8 Qc1 Quebec chilled 1000 38.7
#> 9 Qc2 Quebec chilled 95 9.3
#> 10 Qc2 Quebec chilled 1000 42.4
#> 11 Qc3 Quebec chilled 95 15.1
#> 12 Qc3 Quebec chilled 1000 41.4
#> 13 Mn1 Mississippi nonchilled 95 10.6
#> 14 Mn1 Mississippi nonchilled 1000 35.5
#> 15 Mn2 Mississippi nonchilled 95 12.0
#> 16 Mn2 Mississippi nonchilled 1000 31.5
#> 17 Mn3 Mississippi nonchilled 95 11.3
#> 18 Mn3 Mississippi nonchilled 1000 27.8
#> 19 Mc1 Mississippi chilled 95 10.5
#> 20 Mc1 Mississippi chilled 1000 21.9
#> 21 Mc2 Mississippi chilled 95 7.7
#> 22 Mc2 Mississippi chilled 1000 14.4
#> 23 Mc3 Mississippi chilled 95 10.6
#> 24 Mc3 Mississippi chilled 1000 19.9
# Filter on string fields
# %like% takes regex as the right operand; all other verbs treat strings literally.
arr_existing$filter(Plant %contains% "3", Plant %starts_with% "Q", Treatment %contains% "chill", Treatment %like% "chill[a-z]{2}$")$to_df()
#> Plant Type Treatment conc uptake
#> 1 Qc3 Quebec chilled 95 15.1
#> 2 Qc3 Quebec chilled 175 21.0
#> 3 Qc3 Quebec chilled 250 38.1
#> 4 Qc3 Quebec chilled 350 34.0
#> 5 Qc3 Quebec chilled 500 38.9
#> 6 Qc3 Quebec chilled 675 39.6
#> 7 Qc3 Quebec chilled 1000 41.4
To use a variable in a filter expression, prepend `!!`, as shown below:
min_conc <- 80
arr_existing$filter(conc > !!min_conc, uptake > 40)$to_df()
#> Plant Type Treatment conc uptake
#> 1 Qn2 Quebec nonchilled 350 41.8
#> 2 Qn2 Quebec nonchilled 500 40.6
#> 3 Qn2 Quebec nonchilled 675 41.4
#> 4 Qn2 Quebec nonchilled 1000 44.3
#> 5 Qn3 Quebec nonchilled 250 40.3
#> 6 Qn3 Quebec nonchilled 350 42.1
#> 7 Qn3 Quebec nonchilled 500 42.9
#> 8 Qn3 Quebec nonchilled 675 43.9
#> 9 Qn3 Quebec nonchilled 1000 45.5
#> 10 Qc2 Quebec chilled 1000 42.4
#> 11 Qc3 Quebec chilled 1000 41.4
As shown below, field names can be either quoted or unquoted, but each field expression must be a single string. Unlike R, SciDB uses single quotes for string literals; e.g. `'at_'` below is single-quoted.
As with filter expressions, use `!!` to reference variables in memory.
# Modify existing fields
result = arr_stored$mutate(Type = 'at_' + Type, conc = conc/1000)
result$to_afl()
#> [1] "project(project(apply(project(apply(test_CO2_stored,_Type,'at_' + Type,_conc,conc / 1000),Type,conc,inverse:true),Type,_Type,conc,_conc),_Type,_conc,inverse:true),Plant,Type,Treatment,conc,uptake)"
result$to_df()
#> Plant Type Treatment conc uptake
#> 1 Qn1 at_Quebec nonchilled 1 39.7
#> 2 Qn2 at_Quebec nonchilled 1 44.3
#> 3 Qn3 at_Quebec nonchilled 1 45.5
#> 4 Qc1 at_Quebec chilled 1 38.7
#> 5 Qc2 at_Quebec chilled 1 42.4
#> 6 Qc3 at_Quebec chilled 1 41.4
#> 7 Mn1 at_Mississippi nonchilled 1 35.5
#> 8 Mn2 at_Mississippi nonchilled 1 31.5
#> 9 Mn3 at_Mississippi nonchilled 1 27.8
#> 10 Mc1 at_Mississippi chilled 1 21.9
#> 11 Mc2 at_Mississippi chilled 1 14.4
#> 12 Mc3 at_Mississippi chilled 1 19.9
# Add new fields
result = arr_stored$mutate(Plant_Type = Plant + '-' + Type, feel = if(Treatment == 'chilled') 'cold' else 'ok')
# Same as above. Notice the `iif` operator is written explicitly here,
# equivalent to the native R `if` statement above
# result = arr_stored$mutate(Plant_Type = Plant + '-' + Type, feel = iif(Treatment == 'chilled', 'cold', 'ok'))
result$to_afl()
#> [1] "apply(test_CO2_stored,Plant_Type,(Plant + '-') + Type,feel,iif(Treatment = 'chilled', 'cold', 'ok'))"
result$to_df()
#> Plant Type Treatment conc uptake Plant_Type feel
#> 1 Qn1 Quebec nonchilled 1000 39.7 Qn1-Quebec ok
#> 2 Qn2 Quebec nonchilled 1000 44.3 Qn2-Quebec ok
#> 3 Qn3 Quebec nonchilled 1000 45.5 Qn3-Quebec ok
#> 4 Qc1 Quebec chilled 1000 38.7 Qc1-Quebec cold
#> 5 Qc2 Quebec chilled 1000 42.4 Qc2-Quebec cold
#> 6 Qc3 Quebec chilled 1000 41.4 Qc3-Quebec cold
#> 7 Mn1 Mississippi nonchilled 1000 35.5 Mn1-Mississippi ok
#> 8 Mn2 Mississippi nonchilled 1000 31.5 Mn2-Mississippi ok
#> 9 Mn3 Mississippi nonchilled 1000 27.8 Mn3-Mississippi ok
#> 10 Mc1 Mississippi chilled 1000 21.9 Mc1-Mississippi cold
#> 11 Mc2 Mississippi chilled 1000 14.4 Mc2-Mississippi cold
#> 12 Mc3 Mississippi chilled 1000 19.9 Mc3-Mississippi cold
The `mutate` operation keeps the original fields unless we modify them. `transmute` allows us to explicitly specify which fields to keep.
# strings are escaped for AFL
result = arr_stored$transmute(Plant, Plant_Type = Plant + "'-'" + Type, feel = iif(Treatment == 'chilled', 'c"ooo"ld', "OK"))
result$to_afl()
#> [1] "project(apply(test_CO2_stored,Plant_Type,(Plant + '\\'-\\'') + Type,feel,iif(Treatment = 'chilled', 'c\"ooo\"ld', 'OK')),Plant,Plant_Type,feel)"
result$to_df()
#> Plant Plant_Type feel
#> 1 Qn1 Qn1'-'Quebec OK
#> 2 Qn2 Qn2'-'Quebec OK
#> 3 Qn3 Qn3'-'Quebec OK
#> 4 Qc1 Qc1'-'Quebec c"ooo"ld
#> 5 Qc2 Qc2'-'Quebec c"ooo"ld
#> 6 Qc3 Qc3'-'Quebec c"ooo"ld
#> 7 Mn1 Mn1'-'Mississippi OK
#> 8 Mn2 Mn2'-'Mississippi OK
#> 9 Mn3 Mn3'-'Mississippi OK
#> 10 Mc1 Mc1'-'Mississippi c"ooo"ld
#> 11 Mc2 Mc2'-'Mississippi c"ooo"ld
#> 12 Mc3 Mc3'-'Mississippi c"ooo"ld
`mutate` and `transmute` are useful when the mutated fields are defined by expressions. Another common use case is mutating fields of an existing array by another array, rather than by expressions.
dataArray = conn$array_from_df(data.frame(Plant = c("Qn1", "Qn2", "Qn3"), Treatment = "treated-mutated"))
result = arr_stored$mutate_by(dataArray, keys = "Plant", updated_fields = "Treatment")
result$to_afl()
#> [1] "project(join(project(redimension(equi_join(project(build(<Plant:string,Treatment:string>[IEEX_], '[(\\'Qn1\\',\\'treated-mutated\\'),(\\'Qn2\\',\\'treated-mutated\\'),(\\'Qn3\\',\\'treated-mutated\\')]', true),Treatment,Plant) as _L, project(apply(test_CO2_stored,i,i),i,Plant) as _R, left_names:_L.Plant, right_names:_R.Plant),<Treatment:string> [i=1:84:0:10000]),Treatment),project(test_CO2_stored,Plant,Type,conc,uptake)),Plant,Type,Treatment,conc,uptake)"
result$to_df()
#> Plant Type Treatment conc uptake
#> 1 Qn1 Quebec treated-mutated 1000 39.7
#> 2 Qn2 Quebec treated-mutated 1000 44.3
#> 3 Qn3 Quebec treated-mutated 1000 45.5
NOTE: the source arrays remain unchanged in the examples above; all mutate operations merely create mutated copies of the source data.
To modify array content, we can `update` or `overwrite` the source array with the mutated data copies.
dataArray = conn$array_from_df(data.frame(Plant = c("Qn1", "Qn2", "Qn3"), Treatment = "treated-mutated"))
mutated = arr_stored$mutate_by(dataArray, keys = "Plant", updated_fields = "Treatment")
# Persist data changes to an existing array
mutated$update(arr_stored)$execute()
arr_stored$to_df() # Only matched data cells are updated
#> Plant Type Treatment conc uptake
#> 1 Qn1 Quebec treated-mutated 1000 39.7
#> 2 Qn2 Quebec treated-mutated 1000 44.3
#> 3 Qn3 Quebec treated-mutated 1000 45.5
#> 4 Qc1 Quebec chilled 1000 38.7
#> 5 Qc2 Quebec chilled 1000 42.4
#> 6 Qc3 Quebec chilled 1000 41.4
#> 7 Mn1 Mississippi nonchilled 1000 35.5
#> 8 Mn2 Mississippi nonchilled 1000 31.5
#> 9 Mn3 Mississippi nonchilled 1000 27.8
#> 10 Mc1 Mississippi chilled 1000 21.9
#> 11 Mc2 Mississippi chilled 1000 14.4
#> 12 Mc3 Mississippi chilled 1000 19.9
mutated$overwrite(arr_stored)$execute()
arr_stored$to_df() # Caution: Overwrite all array content!!!
#> Plant Type Treatment conc uptake
#> 1 Qn1 Quebec treated-mutated 1000 39.7
#> 2 Qn2 Quebec treated-mutated 1000 44.3
#> 3 Qn3 Quebec treated-mutated 1000 45.5
Four joins are supported: `inner_join`, `left_join`, `right_join`, and `full_join`.
First, let’s populate two arrays.
leftDf = data.frame(lda = 1:5, db = 11:15, fa = LETTERS[1:5], lfb = 3.14 * 1:5)
rightDf = data.frame(rda = 3:10, db = 13:20, fa = LETTERS[3:10], rfb = 3.14 * 3:10)
leftDf
#> lda db fa lfb
#> 1 1 11 A 3.14
#> 2 2 12 B 6.28
#> 3 3 13 C 9.42
#> 4 4 14 D 12.56
#> 5 5 15 E 15.70
rightDf
#> rda db fa rfb
#> 1 3 13 C 9.42
#> 2 4 14 D 12.56
#> 3 5 15 E 15.70
#> 4 6 16 F 18.84
#> 5 7 17 G 21.98
#> 6 8 18 H 25.12
#> 7 9 19 I 28.26
#> 8 10 20 J 31.40
# At least one input to 'cross_join' must have a specified chunk size.
# So we need to persist the arrays
L = conn$array_from_df(leftDf, "<fa:string, lfb:double> [lda;db] ", force_template_schema = T)$persist(.gc = F)
R = conn$array_from_df(rightDf, "<fa:string, rfb:double> [rda;db] ", force_template_schema = T)$persist(.gc = F)
Then, start with a vanilla join where the join keys are inferred from array fields. In this case, `fa` and `db` are the join keys since they exist in both arrays.
L$inner_join(R)$to_df()
#> lda db fa lfb rda rfb
#> 1 3 13 C 9.42 3 9.42
#> 2 4 14 D 12.56 4 12.56
#> 3 5 15 E 15.70 5 15.70
L$left_join(R)$to_df()
#> lda db fa lfb rda rfb
#> 1 3 13 C 9.42 3 9.42
#> 2 1 11 A 3.14 NA NA
#> 3 4 14 D 12.56 4 12.56
#> 4 5 15 E 15.70 5 15.70
#> 5 2 12 B 6.28 NA NA
L$right_join(R)$to_df()
#> lda db fa lfb rda rfb
#> 1 NA 18 H NA 8 25.12
#> 2 3 13 C 9.42 3 9.42
#> 3 NA 20 J NA 10 31.40
#> 4 4 14 D 12.56 4 12.56
#> 5 NA 16 F NA 6 18.84
#> 6 NA 17 G NA 7 21.98
#> 7 NA 19 I NA 9 28.26
#> 8 5 15 E 15.70 5 15.70
L$full_join(R)$to_df()
#> lda db fa lfb rda rfb
#> 1 1 11 A 3.14 NA NA
#> 2 3 13 C 9.42 3 9.42
#> 3 NA 18 H NA 8 25.12
#> 4 4 14 D 12.56 4 12.56
#> 5 NA 16 F NA 6 18.84
#> 6 NA 17 G NA 7 21.98
#> 7 NA 20 J NA 10 31.40
#> 8 5 15 E 15.70 5 15.70
#> 9 NA 19 I NA 9 28.26
#> 10 2 12 B 6.28 NA NA
We can specify which fields to keep with `array_op$select`. By default, all fields are selected, except that the join keys appear only once in the result even if they are selected on both operands.
L$select("lfb", "lda")$
inner_join(R)$to_df()
#> lfb lda rda rfb
#> 1 9.42 3 3 9.42
#> 2 12.56 4 4 12.56
#> 3 15.70 5 5 15.70
L$inner_join(R$select("rda", "rfb"))$to_df()
#> lda db fa lfb rda rfb
#> 1 3 13 C 9.42 3 9.42
#> 2 4 14 D 12.56 4 12.56
#> 3 5 15 E 15.70 5 15.70
L$select("lfb", "db")$inner_join(R$select("rfb"))$to_df()
#> lfb db rfb
#> 1 9.42 13 9.42
#> 2 12.56 14 12.56
#> 3 15.70 15 15.70
We can specify the join keys and aliases explicitly. Array aliases are important to disambiguate overlapping fields.
# `db` as join key, `fa` exists in both arrays and will be suffixed with array aliases in the result
L$left_join(R, by = c('db'), left_alias = "_LL", right_alias = "_RR")$to_df()
#> lda db fa_LL lfb rda fa_RR rfb
#> 1 5 15 E 15.70 5 E 15.70
#> 2 2 12 B 6.28 NA <NA> NA
#> 3 3 13 C 9.42 3 C 9.42
#> 4 1 11 A 3.14 NA <NA> NA
#> 5 4 14 D 12.56 4 D 12.56
# different join keys
L$right_join(R, by.x = "lda", by.y = "rda", left_alias = "_LL", right_alias = "_RR")$to_df()
#> lda db_LL fa_LL lfb db_RR fa_RR rfb
#> 1 3 13 C 9.42 13 C 9.42
#> 2 7 NA <NA> NA 17 G 21.98
#> 3 8 NA <NA> NA 18 H 25.12
#> 4 6 NA <NA> NA 16 F 18.84
#> 5 10 NA <NA> NA 20 J 31.40
#> 6 4 14 D 12.56 14 D 12.56
#> 7 9 NA <NA> NA 19 I 28.26
#> 8 5 15 E 15.70 15 E 15.70
By default, joins are implemented by the SciDB `equi_join` operator, which works whether the join keys are array dimensions, attributes, or a mix of the two. However, the drawback of `equi_join` is that it materializes its results, which can be undesirable in a complex nested join.
Two other join modes that avoid materialization, `apply_join` and `cross_join`, are available in some cases but have other restrictions.
The `apply_join` mode is available for `inner_join` and `left_join`; it is not an option for `right_join`, but a right outer join can be expressed as a left outer join by reordering the inputs. Like `equi_join`, the join keys for `apply_join` may be a mix of array dimensions and attributes. Unlike `equi_join`, it does not materialize its results, and the returned array uses the same dimensions as the left-hand input, so it has the same schema as the left-hand array up to the addition of new attributes (or, in the case of left outer joins, the addition of nullable flags to the current attributes). However, it achieves this by replicating the right-hand array to all SciDB instances, and so is inappropriate for very large right-hand inputs.
L$inner_join(R, join_mode='apply_join')$to_df_all()
#> lda db fa lfb rda rfb
#> 1 3 13 C 9.42 3 9.42
#> 2 4 14 D 12.56 4 12.56
#> 3 5 15 E 15.70 5 15.70
L$left_join(R, join_mode='apply_join')$to_df_all()
#> lda db fa lfb rda rfb
#> 1 1 11 A 3.14 NA NA
#> 2 2 12 B 6.28 NA NA
#> 3 3 13 C 9.42 3 9.42
#> 4 4 14 D 12.56 4 12.56
#> 5 5 15 E 15.70 5 15.70
Note that, unlike `equi_join`, the dimensions of the left-hand input do not become attributes of the output, so they must be explicitly selected to be returned when using `to_df()` rather than `to_df_all()`, as shown above. A second restriction is that `apply_join` has non-standard behavior when there are multiple matches for the keys on the right-hand input. Because the output uses the dimensions of the left-hand input, it cannot contain more than one record per original left-hand record; so, when there are multiple matches from the right-hand side, one is chosen arbitrarily to populate the joined attributes of the output.
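For instance, a minimal sketch (not evaluated here): assuming `select` accepts dimension names, as with `L$select("lfb", "lda")` earlier, we can select the left-hand dimensions so `to_df()` returns them:
# hypothetical: explicitly select dimensions lda and db along with the attributes
L$select("lda", "db", "fa", "lfb")$
  inner_join(R, join_mode = 'apply_join')$to_df()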
The `cross_join` mode is available for `inner_join` only. This mode does not materialize results, but is valid only when the join keys are dimensions of both the left-hand and right-hand arrays.
L$inner_join(R,
by.x = c('db', 'lda'),
by.y = c('db', 'rda'),
join_mode = 'cross_join', left_alias = "_LL", right_alias = "_RR")$
to_df()
#> lda db fa_LL lfb fa_RR rfb
#> 1 3 13 C 9.42 C 9.42
#> 2 4 14 D 12.56 D 12.56
#> 3 5 15 E 15.70 E 15.70
When the join keys are a strict subset of the dimensions of the right-hand array, the output contains, as expected, the Cartesian product of the matched records.
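For instance, a minimal sketch (not evaluated here) joining on `db` alone, a strict subset of `R`'s dimensions `[rda; db]`:
# hypothetical: each matched db value yields the Cartesian product of the
# left- and right-hand records sharing it
L$inner_join(R, by.x = c('db'), by.y = c('db'),
  join_mode = 'cross_join', left_alias = "_LL", right_alias = "_RR")$to_df()
We can also pass settings to the underlying `equi_join` operator through the `settings` argument, as the next example shows.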
result = L$inner_join(R,
settings = list(
algorithm = "'hash_replicate_right'", # string setting item
hash_join_threshold = 1024 # numeric setting item
))
result$to_afl()
#> [1] "equi_join(project(apply(Rarrayop_375733fbcfb3,lda,lda),lda,fa,lfb) as _L, project(apply(Rarrayop_3757609a5129,rda,rda),rda,fa,rfb) as _R, left_names:(_L.db,_L.fa), right_names:(_R.db,_R.fa), algorithm:'hash_replicate_right', hash_join_threshold:1024)"
result$to_df()
#> lda db fa lfb rda rfb
#> 1 3 13 C 9.42 3 9.42
#> 2 4 14 D 12.56 4 12.56
#> 3 5 15 E 15.70 5 15.70
In many SQL dialects, we can define an auto-incremented field: when new records are inserted, the field is automatically incremented. SciDB currently does not support this feature.
Given a data source array_op (normally created from an R data frame or `conn$fread`) and a target template array which we insert data into, we can emulate this feature in two steps: first call `set_auto_fields` on the source to populate the auto-incremented fields, then `update` the target array with the result.
Let’s see an example. First, create a target array with 3 attributes and 2 dimensions, and a source array with some data. Notice we set the source array’s `build_dim_spec = 'z'` so that we can refer to it later.
AutoFieldsContent = data.frame(da=1:10, db=1:10, f_str = letters[1:10], f_int32=1:10, f_int64=11:20)
target = conn$array_from_df(AutoFieldsContent,
template = "<f_str:string, f_int32:int32, f_int64:int64> [da=0:*:0:1000; db=0:*:0:1000]",
force_template_schema = T)$
persist(.temp=T, .gc = F)
empty_target = conn$create_array(dbutils$random_array_name(), target, .temp = T)
source = conn$compile_df(data.frame(f_str = LETTERS[1:3]), template = target, build_dim_spec = "z")
source$set_auto_fields(target)$to_df() # same as below
#> f_str da db f_int32 f_int64
#> 1 A 11 11 11 21
#> 2 B 12 12 12 22
#> 3 C 13 13 13 23
# source$set_auto_fields(target, source_auto_increment=c(z=0), target_auto_increment=c(da=0, db=0, f_int32=0))$to_df()
# target_auto_increment does not matter if the target array is not empty
source$set_auto_fields(target, source_auto_increment=c(z=0), target_auto_increment=c(da=1, db=2, f_int32=3))$to_df()
#> f_str da db f_int32
#> 1 A 11 11 11
#> 2 B 12 12 12
#> 3 C 13 13 13
# target_auto_increment affects the initial field values if the target array is empty
source$set_auto_fields(empty_target, source_auto_increment=c(z=0), target_auto_increment=c(da=1, db=2, f_int32=3))$to_df()
#> f_str da db f_int32
#> 1 A 1 2 3
#> 2 B 2 3 4
#> 3 C 3 4 5
# By default, both source and target auto increments start from 0.
# The only dimension in source is used as the source_auto_increment field; otherwise, we need to specify the source field explicitly.
# All missing fields are assumed to be target_auto_increment fields unless specified otherwise.
source$set_auto_fields(empty_target)$to_df()
#> f_str da db f_int32 f_int64
#> 1 A 0 0 0 0
#> 2 B 1 1 1 1
#> 3 C 2 2 2 2
source$set_auto_fields(empty_target, target_auto_increment = c("da", "db"))$to_df()
#> f_str da db
#> 1 A 0 0
#> 2 B 1 1
#> 3 C 2 2
In a less common case, we may need to populate a dimension field to avoid cell coordinate collisions.
AutoFieldsContent
#> da db f_str f_int32 f_int64
#> 1 1 1 a 1 11
#> 2 2 2 b 2 12
#> 3 3 3 c 3 13
#> 4 4 4 d 4 14
#> 5 5 5 e 5 15
#> 6 6 6 f 6 16
#> 7 7 7 g 7 17
#> 8 8 8 h 8 18
#> 9 9 9 i 9 19
#> 10 10 10 j 10 20
conn$
compile_df(data.frame(f_str = LETTERS[4:10], f_int32 = 4:10, da = c(1,2,1,2,1,4,5)), template = target)$
set_auto_fields(target,
anti_collision_field = 'db')$
to_df() |> dplyr::arrange(da, db)
#> da db f_str f_int32
#> 1 1 2 D 4
#> 2 1 3 F 6
#> 3 1 4 H 8
#> 4 2 3 E 5
#> 5 2 4 G 7
#> 6 4 5 I 9
#> 7 5 6 J 10
conn$
compile_df(data.frame(f_str = LETTERS[4:10], f_int32 = 4:10, da = c(1,2,1,2,1,4,5)), template = empty_target)$
set_auto_fields(empty_target,
anti_collision_field = 'db')$
to_df() |> dplyr::arrange(da, db)
#> da db f_str f_int32
#> 1 1 0 D 4
#> 2 1 1 F 6
#> 3 1 2 H 8
#> 4 2 0 E 5
#> 5 2 1 G 7
#> 6 4 0 I 9
#> 7 5 0 J 10
Notice there are no duplicated tuples of (da, db) so they can be safely inserted into the target array without coordinate collisions.
We can combine the two types of auto fields in one run and update the target array.
conn$
compile_df(data.frame(f_str = LETTERS[4:10], da = c(1,2,1,2,1,4,5)), template = target, build_dim_spec = "z")$
set_auto_fields(target,
source_auto_increment = "z",
target_auto_increment = "f_int64",
anti_collision_field = 'db')$
mutate(f_int32 = int32(null))$
change_schema(target)$
update(target)$
execute()
target$to_df_all()
#> da db f_str f_int32 f_int64
#> 1 1 1 a 1 11
#> 2 1 2 D NA 21
#> 3 1 3 F NA 23
#> 4 1 4 H NA 25
#> 5 2 2 b 2 12
#> 6 2 3 E NA 22
#> 7 2 4 G NA 24
#> 8 3 3 c 3 13
#> 9 4 4 d 4 14
#> 10 4 5 I NA 26
#> 11 5 5 e 5 15
#> 12 5 6 J NA 27
#> 13 6 6 f 6 16
#> 14 7 7 g 7 17
#> 15 8 8 h 8 18
#> 16 9 9 i 9 19
#> 17 10 10 j 10 20
conn$
compile_df(data.frame(f_str = LETTERS[4:10], da = c(1,2,1,2,1,4,5)), template = target, build_dim_spec = "z")$
set_auto_fields(empty_target,
source_auto_increment = "z",
target_auto_increment = "f_int64",
anti_collision_field = 'db')$
mutate(f_int32 = int32(null))$
change_schema(empty_target)$
update(empty_target)$
execute()
empty_target$to_df_all()
#> da db f_str f_int32 f_int64
#> 1 1 0 D NA 0
#> 2 1 1 F NA 2
#> 3 1 2 H NA 4
#> 4 2 0 E NA 1
#> 5 2 1 G NA 3
#> 6 4 0 I NA 5
#> 7 5 0 J NA 6
# clean up
try({
target$remove_array()
empty_target$remove_array()
}, silent = T)
# Tip: Call `dbutils$db_aggregates()$to_df()` to see supported aggregate functions
arr_existing$
group_by("Plant", "Type")$
summarize(count = count(),
uptake_sd = stdev(uptake),
avg(conc), # unnamed fields are named by SciDB
max(uptake))$
to_df()
#> Plant Type count uptake_sd conc_avg uptake_max
#> 1 Qn1 Quebec 7 8.214766 435 39.7
#> 2 Mc1 Mississippi 7 4.118657 435 22.2
#> 3 Qc1 Quebec 7 8.334609 435 38.7
#> 4 Qn3 Quebec 7 10.349948 435 45.5
#> 5 Mc2 Mississippi 7 2.186974 435 14.4
#> 6 Qn2 Quebec 7 11.004069 435 44.3
#> 7 Mn1 Mississippi 7 8.694251 435 35.5
#> 8 Mc3 Mississippi 7 3.049044 435 19.9
#> 9 Mn3 Mississippi 7 6.484707 435 28.5
#> 10 Qc3 Quebec 7 10.321083 435 41.4
#> 11 Qc2 Quebec 7 11.336960 435 42.4
#> 12 Mn2 Mississippi 7 7.652855 435 32.4
# Aggregation result is still an array_op. So we can carry on.
arr_existing$
group_by("Plant", "Type")$
summarize(count = count(),
uptake_sd = stdev(uptake),
avg(conc),
max(uptake))$
filter(uptake_sd > 8)$ # requires uptake standard deviation > 8
to_df()
#> Plant Type count uptake_sd conc_avg uptake_max
#> 1 Qn1 Quebec 7 8.214766 435 39.7
#> 2 Qc1 Quebec 7 8.334609 435 38.7
#> 3 Qn3 Quebec 7 10.349948 435 45.5
#> 4 Qn2 Quebec 7 11.004069 435 44.3
#> 5 Mn1 Mississippi 7 8.694251 435 35.5
#> 6 Qc3 Quebec 7 10.321083 435 41.4
#> 7 Qc2 Quebec 7 11.336960 435 42.4
Since we modified `arr_stored` several times, let’s check how many versions it has so far and then delete the version history.
arr_stored$list_versions()
#> version_id timestamp
#> 1 1 2025-04-10 18:03:06
#> 2 2 2025-04-10 18:03:07
#> 3 3 2025-04-10 18:03:07
# Check how many cells in each version
for(version_id in 1:3){
print(sprintf("Version-%d has %d cells", version_id, arr_stored$version(version_id)$cell_count()))
}
#> [1] "Version-1 has 12 cells"
#> [1] "Version-2 has 12 cells"
#> [1] "Version-3 has 3 cells"
Remove all but the latest version.
arr_stored$remove_versions()
arr_stored$list_versions()
#> version_id timestamp
#> 1 3 2025-04-10 18:03:07
The `fread` method of a connection object takes one or more file paths as input, along with other params, and returns an array_op which can be further processed to modify existing arrays or join with other arrays.
If we need to read data from all file columns, use `fread` without a template. This behaves similarly to `data.table::fread`.
# file column names and types are inferred by peeking at the first few lines of the file
conn$fread("/file/path", header = T, sep = "\t") # default sep is "\t"
# col.names are V1, V2, ... types are inferred.
conn$fread("/file/path", header = F, nrow = 10) # infer column type using only up to first 10 rows
# file column types are inferred, but names are provided explicitly. col.names can differ from the actual file column names but must match the number of columns.
conn$fread("/file/path", header = T, col.names = c("chrom", "pos", "ref", "alt"))
conn$fread("/file/path", header = F, col.names = c("chrom", "pos", "ref", "alt")) # same as above, but read the first row of data
More commonly, we want to load data from a file into an existing array. In this case, we provide a template (either an array_op or a schema string); only the columns whose names match the template’s fields are imported.
NOTE: `template` field types supersede the actual file column types. If `header = T`, column names can still be inferred.
conn$fread("/file/path", template = templateArray, header = T)
# template can also be a schema string
# When header = F, we need to provide the correct col.names so they match the template fields.
conn$fread("/file/path", template = "<ref:string, alt:string> [chrom;pos]",
header = F, col.names = c("chrom", "pos", "extra_col1", "ref", "alt", "extra_col2")) # extra_cols ignored
In the examples above, `fread` generates AFL where fields are coerced to the template data types. If no template is provided, we can think of the data.frame read from the first `nrows` lines of the file as a template. But not all files are perfectly formatted; we often need to handle ill-formatted fields or missing data.
The `auto_dcast` param, if set to TRUE, casts all numerical fields using `dcast(a1, double(null))`, where `double` can be any numerical type and `a1` is the matching field.
conn$fread("/file/path", template = myArray, header = T, auto_dcast = T)
For special field casting, we can mutate imported fields on the fly with `mutate_fields`. `@` is replaced with the raw imported string field, e.g. `a0`, `a1`, `a2`, …
conn$fread("/file/path", template = myArray, header = T,
mutate_fields = list('chrom' = "iif(@='X', 23, iif(@='Y', 24, dcast(@, int64(null))))"))
# We can still dcast other numeric fields while casting `chrom` differently
conn$fread("/file/path", template = myArray, header = T,
auto_dcast = T,
mutate_fields = list('chrom' = "iif(@='X', 23, iif(@='Y', 24, dcast(@, int64(null))))"))
Hopefully `arrayop` captures all your common SciDB use cases. In case something is missing, we can run raw AFL statements directly on a ScidbConnection object.
To run AFL statements (as strings, i.e. length-1 R character vectors), use one of these methods of a `ScidbConnection`:
- `query` if we expect results (array attributes as R data frames)
- `query_all` if we need both array dimensions and attributes
- `execute` if we do not expect results
conn$query("list('operators')")
#> name library
#> 1 add_attributes scidb
#> 2 add_instances system
#> 3 add_user_to_role namespaces
#> 4 aggregate scidb
#> 5 aio_input accelerated_io_tools
#> 6 aio_save accelerated_io_tools
#> 7 apply scidb
#> 8 apply_join equi_join
#> 9 attributes scidb
#> 10 avg_rank scidb
#> 11 begin scidb
#> 12 bernoulli scidb
#> 13 between scidb
#> 14 build scidb
#> 15 builtin_equi_join scidb
#> 16 builtin_grouped_aggregate scidb
#> 17 cancel scidb
#> 18 cast scidb
#> 19 change_user namespaces
#> 20 commit scidb
#> 21 consume scidb
#> 22 create_array scidb
#> 23 create_namespace namespaces
#> 24 create_role namespaces
#> 25 create_user namespaces
#> 26 cross_between scidb
#> 27 cross_join scidb
#> 28 cumulate scidb
#> 29 delete scidb
#> 30 dimensions scidb
#> 31 drop_namespace namespaces
#> 32 drop_role namespaces
#> 33 drop_user namespaces
#> 34 drop_user_from_role namespaces
#> 35 echo scidb
#> 36 equi_join equi_join
#> 37 filter scidb
#> 38 flatten scidb
#> 39 grouped_aggregate grouped_aggregate
#> 40 hash scidb
#> 41 help scidb
#> 42 index_lookup scidb
#> 43 input scidb
#> 44 insert scidb
#> 45 join scidb
#> 46 limit scidb
#> 47 list scidb
#> 48 list_array_residency system
#> 49 list_instances system
#> 50 load_library scidb
#> 51 load_module scidb
#> 52 lock_arrays scidb
#> 53 merge scidb
#> 54 move_array_to_namespace namespaces
#> 55 parse accelerated_io_tools
#> 56 project scidb
#> 57 quantile scidb
#> 58 rank scidb
#> 59 redimension scidb
#> 60 regrid scidb
#> 61 remove scidb
#> 62 remove_instances system
#> 63 remove_versions scidb
#> 64 rename scidb
#> 65 repart scidb
#> 66 reshape scidb
#> 67 rollback scidb
#> 68 save scidb
#> 69 scan scidb
#> 70 secure_scan scidb
#> 71 set_namespace namespaces
#> 72 set_role_permissions namespaces
#> 73 show scidb
#> 74 show_namespace namespaces
#> 75 show_role_permissions namespaces
#> 76 show_roles_for_user namespaces
#> 77 show_user namespaces
#> 78 show_user_permissions namespaces
#> 79 show_users_in_role namespaces
#> 80 slice scidb
#> 81 sort scidb
#> 82 split accelerated_io_tools
#> 83 store scidb
#> 84 subarray scidb
#> 85 subdelete scidb
#> 86 substitute scidb
#> 87 summarize scidb
#> 88 sync system
#> 89 test_clear_cache scidb
#> 90 transpose scidb
#> 91 unfold scidb
#> 92 uniq scidb
#> 93 unload_library scidb
#> 94 unpack scidb
#> 95 unregister_instances system
#> 96 variable_window scidb
#> 97 versions scidb
#> 98 window scidb
#> 99 xgrid scidb
conn$query("op_scidbversion()")
#> major minor patch
#> 1 23 10 14
Let’s create an array `temp_array` in the `public` namespace for later examples.
conn$execute("create array public.temp_array <a:int32, b:string, c:bool> [z]")
conn$query("show(temp_array)") # query(...) returns result attributes only
#> schema distribution etcomp
#> 1 temp_array<a:int32,b:string,c:bool> [z=0:*:0:*] hashed none
conn$query_all("show(temp_array)") # query_all(...) returns result dimensions and attributes
#> i schema distribution etcomp
#> 1 0 temp_array<a:int32,b:string,c:bool> [z=0:*:0:*] hashed none
We can pipe multiple SciDB operators with the `arrayop::afl` function, which returns an AFL string. Depending on the number of operands:
- `afl(a | b)` is converted to `b(a)`
- `afl(a | b(c))` is converted to `b(a, c)`
conn$query(afl(
"'operators'" | list | filter("library != 'scidb'") | limit(10)
))
#> name library
#> 1 add_instances system
#> 2 add_user_to_role namespaces
#> 3 aio_input accelerated_io_tools
#> 4 aio_save accelerated_io_tools
#> 5 apply_join equi_join
#> 6 change_user namespaces
#> 7 create_namespace namespaces
#> 8 create_role namespaces
#> 9 create_user namespaces
#> 10 drop_namespace namespaces