CozySynthesizer / cozy

The collection synthesizer
https://cozy.uwplse.org
Apache License 2.0
209 stars 18 forks source link

examples/basic.ds gets much worse #118

Open ricochet1k opened 4 years ago

ricochet1k commented 4 years ago

I'm really excited about the concept of collection optimization and synthesis, but I'm a little confused about a few things.

First, examples/basic.ds is mentioned in the readme as a good starting point, but why? It seems pretty clear to me (I don't have a proof) that basic.ds cannot get any better. If Cozy (even working correctly) cannot make any improvements to this example, why is it the first example you point people to?

Second, Cozy seems to make it much much worse. I thought it might be a regression, but I've gone very far back into the git history, and it produces garbage like this every time:

Concretization functions:

_var23 : Bag<Int> = l
_var444 : Map<Int, Bool> = EMakeMap2(l, _var427 -> true)
_var10957 : Map<Int, Bool> = EMakeMap2(l, _var3010 -> ((len Filter {_var3011 -> (_var3010 == _var3011)} (l)) <= 1))
_var351979 : Map<Int, Int> = EMakeMap2(l, _var350114 -> (len Filter {_var350115 -> (_var350114 == _var350115)} (l)))
_var372222 : Map<Int, Bag<Int>> = EMakeMap2(Map {_var3011 -> _var3011} (l), _var355118 -> Filter {_var3011 -> (_var3011 == _var355118)} (l))

Basic:
  state _var23 : Bag<Int>
  state _var444 : Map<Int, Bool>
  state _var10957 : Map<Int, Bool>
  state _var351979 : Map<Int, Int>
  state _var372222 : Map<Int, Bag<Int>>

  query elems():
    _var23

  op add(n : Int):
    let _var531203 = (EHasKey(_var444, n) ? [] : [n]);
    let _var531204 = Filter {_var12792 -> ((not EHasKey(_var10957, _var12792)) or (not (EHasKey(_var10957, _var12792) ? (((len Filter {_var3011 -> (_var12792 == _var3011)} (_var23)) <= 1) == EMakeMap2((_var23 + [n]), _var3010 -> ((len Filter {_var3011 -> (_var3010 == _var3011)} ((_var23 + [n]))) <= 1))[_var12792]) : (false == EMakeMap2((_var23 + [n]), _var3010 -> ((len Filter {_var3011 -> (_var3010 == _var3011)} ((_var23 + [n]))) <= 1))[_var12792]))))} (EMapKeys(EMakeMap2((_var23 + [n]), _var3010 -> ((len Filter {_var3011 -> (_var3010 == _var3011)} ((_var23 + [n]))) <= 1))));
    for _var379617 in []:
      del _var372222[_var379617];
    for _var379617 in [n]:
      with _var372222[_var379617] as _var379618:
        for _var384196 in []:
          _var379618.remove(_var384196);
        for _var384196 in [n]:
          _var379618.add(_var384196);
    for _var354039 in []:
      del _var351979[_var354039];
    for _var354039 in [n]:
      with _var351979[_var354039] as _var354040:
        _var354040 = let _var373462 = (_var23 + [n]) in EMakeMap2(_var373462, _var350114 -> (len Filter {_var350115 -> (_var350114 == _var350115)} (_var373462)))[_var354039];
    for _var12792 in []:
      del _var10957[_var12792];
    for _var12792 in _var531204:
      with _var10957[_var12792] as _var12793:
        _var12793 = (not _var12793);
    for _var40 in []:
      _var23.remove(_var40);
    for _var40 in [n]:
      _var23.add(_var40);
    for _var548 in []:
      del _var444[_var548];
    for _var548 in _var531203:
      with _var444[_var548] as _var549:
        _var549 = true;

  op remove(n : Int):
    let _var531205 = Filter {_var364445 -> ((not EHasKey(_var351979, _var364445)) or (not ((_var364445 in _var23) ? ((len Filter {_var350115 -> (_var364445 == _var350115)} (_var23)) == let _var370427 = (_var23 - [n]) in EMakeMap2(_var370427, _var350114 -> (len Filter {_var350115 -> (_var350114 == _var350115)} (_var370427)))[_var364445]) : (0 == let _var370427 = (_var23 - [n]) in EMakeMap2(_var370427, _var350114 -> (len Filter {_var350115 -> (_var350114 == _var350115)} (_var370427)))[_var364445]))))} (EMapKeys(let _var370422 = (_var23 - [n]) in EMakeMap2(_var370422, _var350114 -> (len Filter {_var350115 -> (_var350114 == _var350115)} (_var370422)))));
    let _var531206 = Filter {_var3009 -> _var10957[_var3009]} (Filter {_var3009 -> (_var3009 == n)} (EMapKeys(_var444)));
    let _var531207 = Filter {_var19294 -> (((len _var372222[_var19294]) <= 1) != (true ? ((len Filter {_var3011 -> (_var19294 == _var3011)} ((_var23 - [n]))) <= 1) : false))} (EMapKeys(EMakeMap2((_var23 - [n]), _var3010 -> ((len (_var372222[_var3010] - ((_var3010 == n) ? [n] : []))) <= 1))));
    let _var531208 = Filter {_var3009 -> _var10957[_var3009]} (Filter {_var3009 -> (_var3009 == n)} (EMapKeys(_var444)));
    for _var417575 in Filter {_var3009 -> _var10957[_var3009]} (Filter {_var3009 -> (_var3009 == n)} (EMapKeys(_var444))):
      del _var372222[_var417575];
    for _var417575 in Filter {_var364445 -> ((not EHasKey(_var351979, _var364445)) or (not ((_var364445 in _var23) ? ((len Filter {_var350115 -> (_var364445 == _var350115)} (_var23)) == let _var370427 = (_var23 - [n]) in EMakeMap2(_var370427, _var350114 -> (len Filter {_var350115 -> (_var350114 == _var350115)} (_var370427)))[_var364445]) : (0 == let _var370427 = (_var23 - [n]) in EMakeMap2(_var370427, _var350114 -> (len Filter {_var350115 -> (_var350114 == _var350115)} (_var370427)))[_var364445]))))} (EMapKeys(let _var370422 = (_var23 - [n]) in EMakeMap2(_var370422, _var350114 -> (len Filter {_var350115 -> (_var350114 == _var350115)} (_var370422))))):
      with _var372222[_var417575] as _var417576:
        for _var421706 in [n]:
          _var417576.remove(_var421706);
        for _var421706 in []:
          _var417576.add(_var421706);
    for _var364445 in Filter {_var3009 -> _var10957[_var3009]} (Filter {_var3009 -> (_var3009 == n)} (EMapKeys(_var444))):
      del _var351979[_var364445];
    for _var364445 in _var531205:
      with _var351979[_var364445] as _var364446:
        _var364446 = (_var364446 - 1);
    for _var19294 in _var531206:
      del _var10957[_var19294];
    for _var19294 in _var531207:
      with _var10957[_var19294] as _var19295:
        _var19295 = true;
    for _var89 in (EHasKey(_var444, n) ? [n] : []):
      _var23.remove(_var89);
    for _var89 in []:
      _var23.add(_var89);
    for _var956 in _var531208:
      del _var444[_var956];
    for _var956 in []:
      with _var444[_var956] as _var957:
        pass;

Number of improvements done: 114

What is going on here? Dead code elimination seems to be totally broken (_var23 is the only state used by the query, all the other states and updates to them should be deleted), but also, why are any of these things ever chosen in the first place? Not a single one of them could possibly improve on the efficiency of any of the queries.

This project doesn't seem very active anymore. Is there any more development effort happening? Is there a better channel for communication?

Calvin-L commented 4 years ago

Thanks for reaching out!

You are right, basic.ds is a terrible first example. Cozy's behavior on that simple input is quite surprising. (However, it isn't totally unreasonable; see below.) I will think about what to do, and respond to some of your other points in the meantime.

Cozy seems to make it much much worse

Cozy always prefers to optimize query performance over update performance. Cozy has found the fastest possible implementation of elems(), and it is struggling to improve remove().

In that light, what Cozy is doing is actually sort of reasonable. The synthesizer is trying to avoid the O(n) cost of _var23.remove(...) whenever it can. Element removal from bags has linear complexity, since bags are represented with arrays at code-generation time. Cozy found that it doesn't have to remove elements that are not in the bag:

    for _var89 in (EHasKey(_var444, n) ? [n] : []):
      _var23.remove(_var89);

To do this, Cozy needs _var444, which tracks the set of unique values in the bag. But this creates a new problem: what happens when you remove a copy of a value from the bag? You should remove it from _var444 if it was the last copy, and keep it if there are others. Cozy introduces _var10957 to track which values have exactly one copy in the bag so that it can determine what to do to _var444 when you remove an element. Then it realizes it needs _var351979 to track how many copies there are of each element. Finally, _var372222 does actually look spurious to me; hopefully with a few hours more synthesis time Cozy would realize it doesn't need that one.

Many of the insanely long let _varABC = ... lines in the output you included actually have very simple implementations, but it will take Cozy quite a while to discover those simple implementations.

If you add a precondition to remove that says you promise never to try removing elements that aren't in the bag (assume n in l;), then Cozy realizes it can't avoid the removal and will terminate almost immediately with an implementation identical to the input specification.

Dead code elimination seems to be totally broken (_var23 is the only state used by the query, all the other states and updates to them should be deleted)

See above: Cozy introduces the extra members to help implement remove. For instance, _var444 is used directly to determine whether to skip the call to _var23.remove(_var89).

This project doesn't seem very active anymore. Is there any more development effort happening?

Day-to-day development has mostly stopped since I no longer work on Cozy full-time. For the past two years or so @izgzhen has been keeping the lights on and prototyping some new ideas.

I have some ambitious ideas for improvements, but right now (as you discovered) Cozy is very unpredictable and remains more "research prototype" than "useful tool". :(

Is there a better channel for communication?

I watch GitHub like a hawk, so you've found the best channel for communication.

ricochet1k commented 4 years ago

Wow, okay, thanks for your explanation. I didn't realize Bag was so inefficient.

If I was trying to optimize this in the same way, I would immediately reach for something like Map<T, Int> where the value is the number of copies, along with a special iterator that replicates each key as many times as necessary. I'm aware that that would take more space if all the elements are unique.

So, it seems like a good solution would be to just give it a better implementation of Bag?

Calvin-L commented 4 years ago

Agreed, that is likely the best path forward.

(A bit more context: as tool implementor, I often had to choose whether to spend my time improving Cozy's core synthesis algorithm or giving it more built-in knowledge. I have always fallen on the side of improving the synthesis algorithm rather than giving Cozy a better data structure library, and that may have been the wrong choice. It would be nice to spend a lot of time thinking deeply about Cozy's data structure library if I ever get a chance.)