Adding backend.parameters support in equijoin

It would be very useful (and trivial to implement) to support backend.parameters in equijoin.

It would look something like this (I changed some formatting for my own ease of understanding the code).

# Equijoin of two keyed inputs implemented on top of mapreduce.
#
# Arguments:
#   left.input, right.input  the two inputs to join; alternatively a single
#                            `input` can be given, in which case both
#                            map.left and map.right are applied to each
#                            record of it.
#   output                   where the final result of the join is written
#                            (passed to mapreduce as `output`).
#   input.format             format used to read the inputs of the join job.
#   output.format            format used to write the final result.
#   outer                    "" for an inner join, or "left", "right", "full".
#   map.left, map.right      map functions applied to the records of the
#                            left and right side respectively.
#   reduce                   function(k, vl, vr) combining the left and right
#                            values that share key k; defaults to the
#                            reduce.default defined in the body below.
#   backend.parameters       forwarded unchanged to every mapreduce call
#                            issued by this function (one call for an inner
#                            join, three for an outer join).
#
# Returns whatever the last mapreduce call returns.
equijoin = function(
  left.input = NULL, 
  right.input = NULL, 
  input = NULL, 
  output = NULL, 
  input.format = "native",
  output.format = "native",
  outer = c("", "left", "right", "full"), 
  map.left = to.map(identity), 
  map.right = to.map(identity), 
  reduce  = reduce.default, 
  backend.parameters = NULL) {

  # NOTE(review): this check rejects "nothing supplied" and
  # "left, right and input all supplied", but it lets left.input through
  # without right.input -- confirm whether that is intended before tightening.
  stopifnot(
    xor(
      !is.null(left.input), !is.null(input) &&
        (is.null(left.input) == is.null(right.input))))

  outer = match.arg(outer)
  left.outer = outer == "left"
  right.outer = outer == "right"
  full.outer = outer == "full"
  is.outer = outer != ""

  if (is.null(left.input)) {
    left.input = input
  }

  # Tag every value with the side it came from so the reducer can tell
  # left records from right records.
  mark.side = function(kv, is.left) {
    kv = split.keyval(kv)
    keyval(keys(kv),
           lapply(values(kv),
                  function(v) {
                    list(val = v, is.left = is.left)}))
  }

  # Position of the first character at which l and r differ. min() of an
  # empty which() yields Inf with a warning, hence suppressWarnings.
  prefix.cmp = function(l,r) suppressWarnings(
    min(
      which(!(strsplit(l,split="")[[1]] == strsplit(r, split = "")[[1]]))
    )
  )

  # The record being mapped is attributed to the left side when the path of
  # the current input shares a longer prefix with the left input path than
  # with the right one.
  is.left.side = function(left.input, right.input) {
    li = rmr.normalize.path(to.dfs.path(left.input))
    ri = rmr.normalize.path(to.dfs.path(right.input))
    ci = rmr.normalize.path(current.input())
    prefix.cmp(ci, li) > prefix.cmp(ci, ri)
  }

  # Split the tagged values of one key into a list indexed by "TRUE" (left)
  # and "FALSE" (right), stripping the tags.
  reduce.split = function(vv) {
    tapply(
      vv, 
      sapply(vv, function(v) v$is.left), 
      function(v) lapply(v, function(x)x$val), 
      simplify = FALSE)
  }

  # An empty side is replaced by NA only when the join is outer with respect
  # to the opposite side; otherwise the values are combined as they are.
  pad.side = function(vv, outer) 
    if (length(vv) == 0 && (outer)) c(NA) else c.or.rbind(vv)

  map = {
    if (is.null(input)) {
      # Two distinct inputs: decide the side from the current input path.
      function(k, v) {
        ils = is.left.side(left.input, right.input)
        mark.side(if(ils) map.left(k, v) else map.right(k, v), ils)
      }
    }
    else {
      # Single input: every record contributes to both sides.
      function(k, v) {
        c.keyval(mark.side(map.left(k, v), TRUE), 
                 mark.side(map.right(k, v), FALSE))
      }
    }
  }

  # For outer joins each result is wrapped in a list so that the follow-up
  # jobs can rbind.fill the pieces.
  wrap.if.outer = function(x) 
    if(is.outer) list(x) else x

  reduce.default = function(k, vl, vr) {
    if((is.list(vl) && !is.data.frame(vl)) || 
         (is.list(vr) && !is.data.frame(vr)))
      list(left = vl, right = vr)
    else{
      vl = as.data.frame(vl)
      vr = as.data.frame(vr)
      names(vl) = paste(names(vl), "l", sep = ".")
      names(vr) = paste(names(vr), "r", sep = ".")
      if(all(is.na(vl))) wrap.if.outer(vr)
      else {
        if(all(is.na(vr))) wrap.if.outer(vl)
        else
          # by = NULL makes merge return the cartesian product of both sides
          wrap.if.outer(merge(vl, vr, by = NULL))
      }
    }
  }

  eqj.reduce = function(k, vv) {
    rs = reduce.split(vv)
    left.side = pad.side(rs$`TRUE`, right.outer || full.outer)
    right.side = pad.side(rs$`FALSE`, left.outer || full.outer)
    if(!is.null(left.side) && !is.null(right.side))
      reduce(k[[1]], left.side, right.side)
  }

  # Join job. For an outer join this is only an intermediate result: it is
  # read back by the two jobs below, which do not specify an input format,
  # so it is written in "native" format with output left NULL (as the
  # template job below already does) instead of using the caller's
  # output/output.format.
  # assumes mapreduce's default input.format is "native" -- TODO confirm
  out = mapreduce(
    map = map, 
    reduce = eqj.reduce,
    input = c(left.input, right.input), 
    output = if(is.outer) NULL else output,
    input.format = input.format,
    output.format = if(is.outer) "native" else output.format,
    backend.parameters = backend.parameters)

  if(!is.outer) 
    out
  else {
    # One row carrying the union of the columns seen across all results.
    template = values(
      from.dfs(
        mapreduce(
          out, 
          map = function(k,v) keyval(1, plyr::rbind.fill(v)[1,]),
          reduce = function(k,v) keyval(1, plyr::rbind.fill(v)[1,]),
          combine = TRUE, 
          backend.parameters = backend.parameters)
      )
    )
    # Final job: pad every result to the template's columns and write it to
    # the location and in the format the caller asked for.
    mapreduce(
      out,
      map = function(k,v) plyr::rbind.fill(c(v, list(template[NULL,]))), 
      output = output,
      output.format = output.format,
      backend.parameters = backend.parameters
    )
  }
}

Thanks for your suggestion. I am about to release 3.2.0, which requires multiple jobs for outer joins. Can you extend your approach to that case?

On Thu, Aug 14, 2014 at 7:58 AM, Saar Golde notifications@github.com wrote:

It would be very useful (and trivial to implement) support for backend.parameters in equijoin.

It would look something like this (I changed some formatting for my own ease of understanding the code).

equijoin = function( left.input = NULL, right.input = NULL, input = NULL, output = NULL, input.format = "native", output.format = "native", outer = c("", "left", "right", "full"), map.left = to.map(identity), map.right = to.map(identity), reduce = reduce.default, backend.parameters = NULL) {

stopifnot( xor( !is.null(left.input), !is.null(input) && (is.null(left.input) == is.null(right.input))))

outer = match.arg(outer) left.outer = outer == "left" right.outer = outer == "right" full.outer = outer == "full"

if (is.null(left.input)) { left.input = input }

mark.side = function(kv, is.left) { kv = split.keyval(kv) keyval(keys(kv), lapply(values(kv), function(v) { list(val = v, is.left = is.left)})) }

prefix.cmp = function(l,r) suppressWarnings( min( which(!(strsplit(l,split="")[[1]] == strsplit(r, split = "")[[1]])) ) )

is.left.side = function(left.input, right.input) { li = rmr.normalize.path(to.dfs.path(left.input)) ri = rmr.normalize.path(to.dfs.path(right.input)) ci = rmr.normalize.path(current.input()) prefix.cmp(ci, li) > prefix.cmp(ci, ri) }

reduce.split = function(vv) { tapply( vv, sapply(vv, function(v) v$is.left), function(v) lapply(v, function(x)x$val), simplify = FALSE) }

pad.side = function(vv, outer) if (length(vv) == 0 && (outer)) c(NA) else c.or.rbind(vv)

map = { if (is.null(input)) { function(k, v) { ils = is.left.side(left.input, right.input) mark.side(if(ils) map.left(k, v) else map.right(k, v), ils) } } else { function(k, v) { c.keyval(mark.side(map.left(k, v), TRUE), mark.side(map.right(k, v), FALSE)) } } }

wrap.if.outer = function(x) if(outer == "") x else list(x)

reduce.default = function(k, vl, vr) { if((is.list(vl) && !is.data.frame(vl)) || (is.list(vr) && !is.data.frame(vr))) list(left = vl, right = vr) else{ vl = as.data.frame(vl) vr = as.data.frame(vr) names(vl) = paste(names(vl), "l", sep = ".") names(vr) = paste(names(vr), "r", sep = ".") if(all(is.na(vl))) wrap.if.outer(vr) else { if(all(is.na(vr))) wrap.if.outer(vl) else wrap.if.outer(merge(vl, vr, by = NULL)) } } }

eqj.reduce = function(k, vv) { rs = reduce.split(vv) left.side = pad.side(rs$TRUE, right.outer || full.outer) right.side = pad.side(rs$FALSE, left.outer || full.outer) if(!is.null(left.side) && !is.null(right.side)) reduce(k[[1]], left.side, right.side) }

out = mapreduce( map = map, reduce = eqj.reduce, input = c(left.input, right.input), output = output, input.format = input.format, output.format = output.format, backend.parameters=backend.parameters,)

if(outer == "") out else { template = values( from.dfs( mapreduce( out, map = function(k,v) keyval(1, plyr::rbind.fill(v)[1,]), reduce = function(k,v) keyval(1, plyr::rbind.fill(v)[1,]), combine = TRUE, backend.parameters=backend.parameters) ) ) mapreduce( out, map = function(k,v) plyr::rbind.fill(c(v, list(template[NULL,]))), backend.parameters=backend.parameters ) } }

— Reply to this email directly or view it on GitHub https://github.com/RevolutionAnalytics/rmr2/issues/131.

Will take me a couple of days, probably not worth postponing a release...

Now that 3.2.0 is out, we have inner joins in 1 job and outer ones in three. Sounds terrible but the supplemental jobs are relatively lightweight. Now I am not sure to which job the backend.parameters should apply in the case of an outer join and I am wary of adding a triple backend.parameters argument -- I should have said loath. Could you describe your use case, maybe that will clarify things. What are you trying to control?

A couple of things:

Job name, for easier tracking in the logs (mapred.job.name). This one is conceptually the one that fits the least with the rmr.options version of controlling parameters. It pains me to see 'streamjob9165662662418455416.jar' sitting among all the other informative job names. And beyond the aesthetic aspect, it helps me when I make a change and want to look back at how long it took to run a couple of days ago...
java options and memory options (mapreduce.map.memory.mb, mapreduce.map.java.opts and the same ones for reducers), which for memory intensive jobs might need special attention / deviation from the default. If the new outer join has three mapreduce jobs and a couple of them are light, I would think it might make sense to allow people to control the resources separately for the three.

I'm not 100% sure why job-specific backend parameters are such a bad thing - they use 'native' streaming options, and for some of them it just makes sense to have job-specific parameters.

On Wed, Aug 27, 2014 at 6:52 PM, Antonio Piccolboni < notifications@github.com> wrote:

Now that 3.2.0 is out, we have inner joins in 1 job and outer ones in three. Sounds terrible but the supplemental jobs are relatively lightweight. Now I am not sure to which job the backend.parameters should apply in the case of an outer join and I am wary of adding a triple backend.parameters argument -- I should have said loath. Could you describe your use case, maybe that will clarify things. What are you trying to control?

— Reply to this email directly or view it on GitHub https://github.com/RevolutionAnalytics/rmr2/issues/131#issuecomment-53652137 .

That's really helpful

On Thu, Aug 28, 2014 at 4:30 AM, Saar Golde notifications@github.com wrote:

A couple of things:

Job name, for easier tracking in the logs (mapred.job.name). This one is conceptually the one that fits the least with the rmr.options version of controlling parameters. It pains me to see 'streamjob9165662662418455416.jar' sitting among all the other informative job names. And beyond the aesthetic aspect, it helps me when I make a change and want to look back at how long it took to run a couple of days ago...

Got it. Still doesn't solve the multi-job equijoin but let me think about it.

java options and memory options (mapreduce.map.memory.mb, mapreduce.map.java.opts and the same ones for reducers), which for memory intensive jobs might need special attention / deviation from the default. If the new outer join has three mapreduce jobs and a couple of them are light, I would think it might make sense to allow people to control the resources separately for the three.

That's a lot of API!

I'm not 100% sure why the job-specific backend parameters is such a bad thing - it is using 'native' streaming options, and for some of them it just makes sense to have job-specific parameters.

In particular, you should have been at my desk when I had a user pass input or output or map as a backend-specific option. And I have to support that! In general, API design is an art and not a well understood problem, but take a look at these presentation notes (http://limist.com/coding/talk-notes-how-to-design-a-good-api-and-why-it-matters-josh-bloch.html), particularly sections II.3 and I.4. The idea was always that rmr would have multiple backends that are semantically equivalent and differ only in performance, scalability in particular. If the API refers to the backend in any way, that seems unlikely. Compromise: backend parameters are possible but can be safely ignored with only performance consequences. The job name, I think, is not a performance issue but a monitoring issue, so it doesn't change the semantics, and I think it's an OK usage. A new compromise could be: backend parameters can be safely ignored as far as semantics (argument-value relation) is concerned. Performance, monitoring and debugging are fair game. The other problem is that we have only two backends and one is much more important than the other, as opposed to multiple swappable backends.

On Wed, Aug 27, 2014 at 6:52 PM, Antonio Piccolboni < notifications@github.com> wrote:

Now that 3.2.0 is out, we have inner joins in 1 job and outer ones in three. Sounds terrible but the supplemental jobs are relatively lightweight. Now I am not sure to which job the backend.parameters should apply in the case of an outer join and I am wary of adding a triple backend.parameters argument -- I should have said loath. Could you describe your use case, maybe that will clarify things. What are you trying to control?

— Reply to this email directly or view it on GitHub < https://github.com/RevolutionAnalytics/rmr2/issues/131#issuecomment-53652137>

.

— Reply to this email directly or view it on GitHub https://github.com/RevolutionAnalytics/rmr2/issues/131#issuecomment-53707668 .

It makes sense.

My only long-term suggestion would be calling backend.parameters an unsupported feature rather than a feature for advanced users only. Psychologically, a feature for advanced users only makes me want to use it more, not less... Might alleviate some obscure user support requests.

As for three backend parameter lists of outer equijoin: a named list of backend.parameter lists should be something that advanced users (or users of unsupported features, if you feel my previous comment makes sense) should be able to handle.

My 2 cents - it's a tough call.

On Thu, Aug 28, 2014 at 1:41 PM, Antonio Piccolboni < notifications@github.com> wrote:

That's really helpful

On Thu, Aug 28, 2014 at 4:30 AM, Saar Golde notifications@github.com wrote:

A couple of things:

Job name, for easier tracking in the logs (mapred.job.name). This one is conceptually the one that fits the least with the rmr.options version of controlling parameters. It pains me to see 'streamjob9165662662418455416.jar' sitting among all the other informative job names. And beyond the aesthetic aspect, it helps me when I make a change and want to look back at how long it took to run a couple of days ago...

Got it. Still doesn't solve the multi-job equijoin but let me think about it.

java options and memory options (mapreduce.map.memory.mb, mapreduce.map.java.opts and the same ones for reducers), which for memory intensive jobs might need special attention / deviation from the default. If the new outer join has three mapreduce jobs and a couple of them are light, I would think it might make sense to allow people to control the resources separately for the three.

That's a lot of API!

I'm not 100% sure why the job-specific backend parameters is such a bad thing - it is using 'native' streaming options, and for some of them it just makes sense to have job-specific parameters.

In particular, you should have been at my desk when I had a user pass input or output or map as backend specific option. And I have to support that! In general, API design is an art and not a well understood problem, but take a look at this presentation notes (

http://limist.com/coding/talk-notes-how-to-design-a-good-api-and-why-it-matters-josh-bloch.html)

particularly section II.3 and I.4 The idea was always that rmr would have multiple backends that are semantically equivalent, change only in performance, scalability in particular. If the API refers to the backend in any way, that seems unlikely. Compromise: backend parameters are possible but can be safely ignored with only performance consequences. The job name, I think that is not a performance issue but a monitoring issue, so it doesn't change the semantics, so I think it's an OK usage. New compromise could be: backend parameters can be safely ignored as far as semantics (argument-value relation) is concerned. Performance, monitoring, debugging, are fair game. The other problem is that we have only two backends and one is much more important than the other as opposed to multiple swappable backends.

On Wed, Aug 27, 2014 at 6:52 PM, Antonio Piccolboni < notifications@github.com> wrote:

Now that 3.2.0 is out, we have inner joins in 1 job and outer ones in three. Sounds terrible but the supplemental jobs are relatively lightweight. Now I am not sure to which job the backend.parameters should apply in the case of an outer join and I am wary of adding a triple backend.parameters argument -- I should have said loath. Could you describe your use case, maybe that will clarify things. What are you trying to control?

— Reply to this email directly or view it on GitHub <

https://github.com/RevolutionAnalytics/rmr2/issues/131#issuecomment-53652137>

.

— Reply to this email directly or view it on GitHub < https://github.com/RevolutionAnalytics/rmr2/issues/131#issuecomment-53707668>

.

— Reply to this email directly or view it on GitHub https://github.com/RevolutionAnalytics/rmr2/issues/131#issuecomment-53763649 .

RevolutionAnalytics / rmr2

Adding backend.parameters support in equijoin #131