natverse / neuprintr

R client utilities for interacting with the neuPrint connectome analysis service
http://natverse.org/neuprintr
3 stars 3 forks source link

enable exact searches for neo4j numeric values #146

Closed jefferis closed 2 years ago

jefferis commented 2 years ago

e.g. for the VNC dataset, group=12345 gets turned into group='12345' under the hood, which will not match because the field is stored as a number rather than a string. See https://community.neo4j.com/t/checking-a-field-data-type/30773 for an example of finding the type of a field – may need to add this to neuprint_get_fields.

jefferis commented 2 years ago

Hmm, to do this I need to figure out the type of each field. I tried two approaches: one using the R type inferred from parsing the JSON response, the other asking neo4j directly for the type.

# Field names to type-check; obtained via `:::` from a malevnc internal helper.
mnf <- malevnc:::mnp_fields()
#' Infer the R storage mode of a neuPrint Neuron field
#'
#' Fetches the value of `field` from one `:Neuron` node on which the field
#' exists and reports `mode()` of that value after it has been parsed into R.
#'
#' @param field Name of the Neuron property to inspect (a single string).
#' @param cache Passed on to `neuprintr::neuprint_fetch_custom()`.
#' @param ... Additional arguments forwarded to
#'   `neuprintr::neuprint_fetch_custom()`.
#' @return A string such as `"numeric"`, `"character"` or `"logical"`;
#'   `NA_character_` when the query fails or yields no value.
neuprint_typeof <- function(field, cache=TRUE, ...) {
  # The field is backtick-quoted in every position (not only in WHERE) so a
  # property name that needs quoting does not break the RETURN clause.
  q <- glue::glue(
  "
    MATCH (n:Neuron)
    WHERE exists(n.`{field}`)
    RETURN n.`{field}` AS `{field}`
    LIMIT 1
  "
  )
  # Collapse the multi-line template into a single-line query.
  q <- gsub("\\s+", " ", q)
  r <- try(neuprintr::neuprint_fetch_custom(q, include_headers = FALSE, cache = cache, ...))
  if (inherits(r, 'try-error')) {
    return(NA_character_)
  }
  value <- unlist(r$data, use.names = FALSE)
  # No matching node: report a missing type rather than mode(NULL) == "NULL".
  if (length(value) == 0L) {
    return(NA_character_)
  }
  mode(value)
}

#' Ask neo4j for the type of a neuPrint Neuron field
#'
#' Runs `apoc.meta.type()` on the value of `field` for one `:Neuron` node on
#' which the field exists and returns the type name reported by the server.
#'
#' @param field Name of the Neuron property to inspect (a single string).
#' @param conn,dataset Forwarded to `neuprintr::neuprint_fetch_custom()`.
#' @param cache Passed on to `neuprintr::neuprint_fetch_custom()`.
#' @param ... Additional arguments forwarded to
#'   `neuprintr::neuprint_fetch_custom()`.
#' @return The type name as a string (e.g. `"INTEGER"`, `"STRING"`);
#'   `NA_character_` when the query fails or yields no value.
neuprint_typeof2 <- function(field, conn=NULL, dataset=NULL, cache=TRUE, ...) {
  q <- "
    MATCH (n:Neuron)
    WHERE exists(n.`{field}`)
    RETURN apoc.meta.type(n.`{field}`)
    LIMIT 1
  "
  # Collapse whitespace first, then interpolate `field` into the template.
  q <- glue::glue(gsub("\\s+", " ", q))
  # Forward the caller's conn/dataset; hard-coding NULL here would silently
  # ignore whatever connection or dataset the caller asked for.
  r <- try(neuprintr::neuprint_fetch_custom(q, include_headers = FALSE, cache = cache, conn = conn, dataset = dataset, ...))
  if (inherits(r, 'try-error')) {
    return(NA_character_)
  }
  type_name <- unlist(r$data, use.names = FALSE)
  # No matching node: return NA so sapply() over many fields still simplifies
  # to a character vector instead of a list containing NULL.
  if (length(type_name) == 0L) {
    return(NA_character_)
  }
  type_name
}

The answers obviously differ

> sapply(mnf, neuprint_typeof)
       bodyId           pre          post      upstream    downstream 
    "numeric"     "numeric"     "numeric"     "numeric"     "numeric" 
       status   statusLabel          size       roiInfo          gaba 
  "character"   "character"     "numeric"   "character"     "numeric" 
acetylcholine     glutamate       neither       cropped      instance 
    "numeric"     "numeric"     "numeric"     "logical"   "character" 
     somaSide         class      subclass         group      rootSide 
  "character"   "character"   "character"     "numeric"   "character" 
     position   hemilineage     exitNerve somaNeuromere  somaLocation 
  "character"   "character"   "character"   "character"   "character" 
   somaRadius  rootLocation   description     birthtime          type 
    "numeric"   "character"   "character"   "character"   "character" 
    timeStamp     longTract 
  "character"   "character" 

> sapply(mnf, neuprint_typeof2)
         bodyId             pre            post        upstream      downstream 
      "INTEGER"       "INTEGER"       "INTEGER"       "INTEGER"       "INTEGER" 
         status     statusLabel            size         roiInfo            gaba 
       "STRING"        "STRING"       "INTEGER"        "STRING"         "FLOAT" 
  acetylcholine       glutamate         neither         cropped        instance 
        "FLOAT"         "FLOAT"         "FLOAT"       "BOOLEAN"        "STRING" 
       somaSide           class        subclass           group        rootSide 
       "STRING"        "STRING"        "STRING"       "INTEGER"        "STRING" 
       position     hemilineage       exitNerve   somaNeuromere    somaLocation 
   "PointValue"        "STRING"        "STRING"        "STRING"    "PointValue" 
     somaRadius    rootLocation     description       birthtime            type 
        "FLOAT"    "PointValue"        "STRING"        "STRING"        "STRING" 
      timeStamp       longTract 
"LocalDateTime"        "STRING"

In this naive approach the timing just seems to be down to minimum neo4j execution times from Cam to Janelia (about 3s for all fields, about 100ms each query).

jefferis commented 2 years ago

Closed by #152