Implementing more actions

var R = {}; // the Recurrent library
(function (global) {
    "use strict";
    // Utility fun
    function assert(condition, message) {
        // from http://stackoverflow.com/questions/15313418/javascript-assert
        if (!condition) {
            message = message || "Assertion failed";
            if (typeof Error !== "undefined") {
                throw new Error(message);
            }
            throw message; // Fallback
        }
    }
    // Random numbers utils
    var return_v = false;
    var v_val = 0.0;
    var gaussRandom = function () {
        if (return_v) {
            return_v = false;
            return v_val;
        }
        var u = 2 * Math.random() - 1;
        var v = 2 * Math.random() - 1;
        var r = u * u + v * v;
        if (r == 0 || r > 1) return gaussRandom();
        var c = Math.sqrt(-2 * Math.log(r) / r);
        v_val = v * c; // cache this
        return_v = true;
        return u * c;
    }
    var randf = function (a, b) {
        return Math.random() * (b - a) + a;
    }
    var randi = function (a, b) {
        return Math.floor(Math.random() * (b - a) + a);
    }
    var randn = function (mu, std) {
        return mu + gaussRandom() * std;
    }
    // helper function returns array of zeros of length n
    // and uses typed arrays if available
    var zeros = function (n) {
        if (typeof (n) === 'undefined' || isNaN(n)) {
            return [];
        }
        if (typeof ArrayBuffer === 'undefined') {
            // lacking browser support
            var arr = new Array(n);
            for (var i = 0; i < n; i++) {
                arr[i] = 0;
            }
            return arr;
        } else {
            return new Float64Array(n);
        }
    }
    // Mat holds a matrix
    var Mat = function (n, d) {
        // n is number of rows d is number of columns
        this.n = n;
        this.d = d;
        this.w = zeros(n * d);
        this.dw = zeros(n * d);
    }
    Mat.prototype = {
        get: function (row, col) {
            // slow but careful accessor function
            // we want row-major order
            var ix = (this.d * row) + col;
            assert(ix >= 0 && ix < this.w.length);
            return this.w[ix];
        },
        set: function (row, col, v) {
            // slow but careful accessor function
            var ix = (this.d * row) + col;
            assert(ix >= 0 && ix < this.w.length);
            this.w[ix] = v;
        },
        setFrom: function (arr) {
            for (var i = 0, n = arr.length; i < n; i++) {
                this.w[i] = arr[i];
            }
        },
        setColumn: function (m, i) {
            for (var q = 0, n = m.w.length; q < n; q++) {
                this.w[(this.d * q) + i] = m.w[q];
            }
        },
        toJSON: function () {
            var json = {};
            json['n'] = this.n;
            json['d'] = this.d;
            json['w'] = this.w;
            return json;
        },
        fromJSON: function (json) {
            this.n = json.n;
            this.d = json.d;
            this.w = zeros(this.n * this.d);
            this.dw = zeros(this.n * this.d);
            for (var i = 0, n = this.n * this.d; i < n; i++) {
                this.w[i] = json.w[i]; // copy over weights
            }
        }
    }
    var copyMat = function (b) {
        var a = new Mat(b.n, b.d);
        a.setFrom(b.w);
        return a;
    }
    var copyNet = function (net) {
        // nets are (k,v) pairs with k = string key, v = Mat()
        var new_net = {};
        for (var p in net) {
            if (net.hasOwnProperty(p)) {
                new_net[p] = copyMat(net[p]);
            }
        }
        return new_net;
    }
    var updateMat = function (m, alpha) {
        // updates in place
        for (var i = 0, n = m.n * m.d; i < n; i++) {
            if (m.dw[i] !== 0) {
                m.w[i] += -alpha * m.dw[i];
                m.dw[i] = 0;
            }
        }
    }
    var updateNet = function (net, alpha) {
        for (var p in net) {
            if (net.hasOwnProperty(p)) {
                updateMat(net[p], alpha);
            }
        }
    }
    var netToJSON = function (net) {
        var j = {};
        for (var p in net) {
            if (net.hasOwnProperty(p)) {
                j[p] = net[p].toJSON();
            }
        }
        return j;
    }
    var netFromJSON = function (j) {
        var net = {};
        for (var p in j) {
            if (j.hasOwnProperty(p)) {
                net[p] = new Mat(1, 1); // not proud of this
                net[p].fromJSON(j[p]);
            }
        }
        return net;
    }
    var netZeroGrads = function (net) {
        for (var p in net) {
            if (net.hasOwnProperty(p)) {
                var mat = net[p];
                gradFillConst(mat, 0);
            }
        }
    }
    var netFlattenGrads = function (net) {
        var n = 0;
        for (var p in net) {
            if (net.hasOwnProperty(p)) {
                var mat = net[p];
                n += mat.dw.length;
            }
        }
        var g = new Mat(n, 1);
        var ix = 0;
        for (var p in net) {
            if (net.hasOwnProperty(p)) {
                var mat = net[p];
                for (var i = 0, m = mat.dw.length; i < m; i++) {
                    g.w[ix] = mat.dw[i];
                    ix++;
                }
            }
        }
        return g;
    }
    // return Mat but filled with random numbers from gaussian
    var RandMat = function (n, d, mu, std) {
        var m = new Mat(n, d);
        fillRandn(m, mu, std);
        //fillRand(m,-std,std); // kind of :P
        return m;
    }
    // Mat utils
    // fill matrix with random gaussian numbers
    var fillRandn = function (m, mu, std) {
        for (var i = 0, n = m.w.length; i < n; i++) {
            m.w[i] = randn(mu, std);
        }
    }
    var fillRand = function (m, lo, hi) {
        for (var i = 0, n = m.w.length; i < n; i++) {
            m.w[i] = randf(lo, hi);
        }
    }
    var gradFillConst = function (m, c) {
        for (var i = 0, n = m.dw.length; i < n; i++) {
            m.dw[i] = c
        }
    }
    // Transformer definitions
    var Graph = function (needs_backprop) {
        if (typeof needs_backprop === 'undefined') {
            needs_backprop = true;
        }
        this.needs_backprop = needs_backprop;
        // this will store a list of functions that perform backprop,
        // in their forward pass order. So in backprop we will go
        // backwards and evoke each one
        this.backprop = [];
    }
    Graph.prototype = {
        backward: function () {
            for (var i = this.backprop.length - 1; i >= 0; i--) {
                this.backprop[i](); // tick!
            }
        },
        rowPluck: function (m, ix) {
            // pluck a row of m with index ix and return it as col vector
            assert(ix >= 0 && ix < m.n);
            var d = m.d;
            var out = new Mat(d, 1);
            for (var i = 0, n = d; i < n; i++) {
                out.w[i] = m.w[d * ix + i];
            } // copy over the data
            if (this.needs_backprop) {
                var backward = function () {
                    for (var i = 0, n = d; i < n; i++) {
                        m.dw[d * ix + i] += out.dw[i];
                    }
                }
                this.backprop.push(backward);
            }
            return out;
        },
        tanh: function (m) {
            // tanh nonlinearity
            var out = new Mat(m.n, m.d);
            var n = m.w.length;
            for (var i = 0; i < n; i++) {
                out.w[i] = Math.tanh(m.w[i]);
            }
            if (this.needs_backprop) {
                var backward = function () {
                    for (var i = 0; i < n; i++) {
                        // grad for z = tanh(x) is (1 - z^2)
                        var mwi = out.w[i];
                        m.dw[i] += (1.0 - mwi * mwi) * out.dw[i];
                    }
                }
                this.backprop.push(backward);
            }
            return out;
        },
        sigmoid: function (m) {
            // sigmoid nonlinearity
            var out = new Mat(m.n, m.d);
            var n = m.w.length;
            for (var i = 0; i < n; i++) {
                out.w[i] = sig(m.w[i]);
            }
            if (this.needs_backprop) {
                var backward = function () {
                    for (var i = 0; i < n; i++) {
                        // grad for z = tanh(x) is (1 - z^2)
                        var mwi = out.w[i];
                        m.dw[i] += mwi * (1.0 - mwi) * out.dw[i];
                    }
                }
                this.backprop.push(backward);
            }
            return out;
        },
        relu: function (m) {
            var out = new Mat(m.n, m.d);
            var n = m.w.length;
            for (var i = 0; i < n; i++) {
                out.w[i] = Math.max(0, m.w[i]); // relu
            }
            if (this.needs_backprop) {
                var backward = function () {
                    for (var i = 0; i < n; i++) {
                        m.dw[i] += m.w[i] > 0 ? out.dw[i] : 0.0;
                    }
                }
                this.backprop.push(backward);
            }
            return out;
        },
        mul: function (m1, m2) {
            // multiply matrices m1 * m2
            assert(m1.d === m2.n, 'matmul dimensions misaligned');
            var n = m1.n;
            var d = m2.d;
            var out = new Mat(n, d);
            for (var i = 0; i < m1.n; i++) { // loop over rows of m1
                for (var j = 0; j < m2.d; j++) { // loop over cols of m2
                    var dot = 0.0;
                    for (var k = 0; k < m1.d; k++) { // dot product loop
                        dot += m1.w[m1.d * i + k] * m2.w[m2.d * k + j];
                    }
                    out.w[d * i + j] = dot;
                }
            }
            if (this.needs_backprop) {
                var backward = function () {
                    for (var i = 0; i < m1.n; i++) { // loop over rows of m1
                        for (var j = 0; j < m2.d; j++) { // loop over cols of m2
                            for (var k = 0; k < m1.d; k++) { // dot product loop
                                var b = out.dw[d * i + j];
                                m1.dw[m1.d * i + k] += m2.w[m2.d * k + j] * b;
                                m2.dw[m2.d * k + j] += m1.w[m1.d * i + k] * b;
                            }
                        }
                    }
                }
                this.backprop.push(backward);
            }
            return out;
        },
        add: function (m1, m2) {
            assert(m1.w.length === m2.w.length);
            var out = new Mat(m1.n, m1.d);
            for (var i = 0, n = m1.w.length; i < n; i++) {
                out.w[i] = m1.w[i] + m2.w[i];
            }
            if (this.needs_backprop) {
                var backward = function () {
                    for (var i = 0, n = m1.w.length; i < n; i++) {
                        m1.dw[i] += out.dw[i];
                        m2.dw[i] += out.dw[i];
                    }
                }
                this.backprop.push(backward);
            }
            return out;
        },
        dot: function (m1, m2) {
            // m1 m2 are both column vectors
            assert(m1.w.length === m2.w.length);
            var out = new Mat(1, 1);
            var dot = 0.0;
            for (var i = 0, n = m1.w.length; i < n; i++) {
                dot += m1.w[i] * m2.w[i];
            }
            out.w[0] = dot;
            if (this.needs_backprop) {
                var backward = function () {
                    for (var i = 0, n = m1.w.length; i < n; i++) {
                        m1.dw[i] += m2.w[i] * out.dw[0];
                        m2.dw[i] += m1.w[i] * out.dw[0];
                    }
                }
                this.backprop.push(backward);
            }
            return out;
        },
        eltmul: function (m1, m2) {
            assert(m1.w.length === m2.w.length);
            var out = new Mat(m1.n, m1.d);
            for (var i = 0, n = m1.w.length; i < n; i++) {
                out.w[i] = m1.w[i] * m2.w[i];
            }
            if (this.needs_backprop) {
                var backward = function () {
                    for (var i = 0, n = m1.w.length; i < n; i++) {
                        m1.dw[i] += m2.w[i] * out.dw[i];
                        m2.dw[i] += m1.w[i] * out.dw[i];
                    }
                }
                this.backprop.push(backward);
            }
            return out;
        },
    }
    var softmax = function (m) {
        var out = new Mat(m.n, m.d); // probability volume
        var maxval = -999999;
        for (var i = 0, n = m.w.length; i < n; i++) {
            if (m.w[i] > maxval) maxval = m.w[i];
        }
        var s = 0.0;
        for (var i = 0, n = m.w.length; i < n; i++) {
            out.w[i] = Math.exp(m.w[i] - maxval);
            s += out.w[i];
        }
        for (var i = 0, n = m.w.length; i < n; i++) {
            out.w[i] /= s;
        }
        // no backward pass here needed
        // since we will use the computed probabilities outside
        // to set gradients directly on m
        return out;
    }
    var Solver = function () {
        this.decay_rate = 0.999;
        this.smooth_eps = 1e-8;
        this.step_cache = {};
    }
    Solver.prototype = {
        step: function (model, step_size, regc, clipval) {
            // perform parameter update
            var solver_stats = {};
            var num_clipped = 0;
            var num_tot = 0;
            for (var k in model) {
                if (model.hasOwnProperty(k)) {
                    var m = model[k]; // mat ref
                    if (!(k in this.step_cache)) {
                        this.step_cache[k] = new Mat(m.n, m.d);
                    }
                    var s = this.step_cache[k];
                    for (var i = 0, n = m.w.length; i < n; i++) {
                        // rmsprop adaptive learning rate
                        var mdwi = m.dw[i];
                        s.w[i] = s.w[i] * this.decay_rate + (1.0 - this.decay_rate) * mdwi * mdwi;
                        // gradient clip
                        if (mdwi > clipval) {
                            mdwi = clipval;
                            num_clipped++;
                        }
                        if (mdwi < -clipval) {
                            mdwi = -clipval;
                            num_clipped++;
                        }
                        num_tot++;
                        // update (and regularize)
                        m.w[i] += -step_size * mdwi / Math.sqrt(s.w[i] + this.smooth_eps) - regc * m.w[i];
                        m.dw[i] = 0; // reset gradients for next iteration
                    }
                }
            }
            solver_stats['ratio_clipped'] = num_clipped * 1.0 / num_tot;
            return solver_stats;
        }
    }
    var initLSTM = function (input_size, hidden_sizes, output_size) {
        // hidden size should be a list
        var model = {};
        for (var d = 0; d < hidden_sizes.length; d++) { // loop over depths
            var prev_size = d === 0 ? input_size : hidden_sizes[d - 1];
            var hidden_size = hidden_sizes[d];
            // gates parameters
            model['Wix' + d] = new RandMat(hidden_size, prev_size, 0, 0.08);
            model['Wih' + d] = new RandMat(hidden_size, hidden_size, 0, 0.08);
            model['bi' + d] = new Mat(hidden_size, 1);
            model['Wfx' + d] = new RandMat(hidden_size, prev_size, 0, 0.08);
            model['Wfh' + d] = new RandMat(hidden_size, hidden_size, 0, 0.08);
            model['bf' + d] = new Mat(hidden_size, 1);
            model['Wox' + d] = new RandMat(hidden_size, prev_size, 0, 0.08);
            model['Woh' + d] = new RandMat(hidden_size, hidden_size, 0, 0.08);
            model['bo' + d] = new Mat(hidden_size, 1);
            // cell write params
            model['Wcx' + d] = new RandMat(hidden_size, prev_size, 0, 0.08);
            model['Wch' + d] = new RandMat(hidden_size, hidden_size, 0, 0.08);
            model['bc' + d] = new Mat(hidden_size, 1);
        }
        // decoder params
        model['Whd'] = new RandMat(output_size, hidden_size, 0, 0.08);
        model['bd'] = new Mat(output_size, 1);
        return model;
    }
    var forwardLSTM = function (G, model, hidden_sizes, x, prev) {
        // forward prop for a single tick of LSTM
        // G is graph to append ops to
        // model contains LSTM parameters
        // x is 1D column vector with observation
        // prev is a struct containing hidden and cell
        // from previous iteration
        if (prev == null || typeof prev.h === 'undefined') {
            var hidden_prevs = [];
            var cell_prevs = [];
            for (var d = 0; d < hidden_sizes.length; d++) {
                hidden_prevs.push(new R.Mat(hidden_sizes[d], 1));
                cell_prevs.push(new R.Mat(hidden_sizes[d], 1));
            }
        } else {
            var hidden_prevs = prev.h;
            var cell_prevs = prev.c;
        }
        var hidden = [];
        var cell = [];
        for (var d = 0; d < hidden_sizes.length; d++) {
            var input_vector = d === 0 ? x : hidden[d - 1];
            var hidden_prev = hidden_prevs[d];
            var cell_prev = cell_prevs[d];
            // input gate
            var h0 = G.mul(model['Wix' + d], input_vector);
            var h1 = G.mul(model['Wih' + d], hidden_prev);
            var input_gate = G.sigmoid(G.add(G.add(h0, h1), model['bi' + d]));
            // forget gate
            var h2 = G.mul(model['Wfx' + d], input_vector);
            var h3 = G.mul(model['Wfh' + d], hidden_prev);
            var forget_gate = G.sigmoid(G.add(G.add(h2, h3), model['bf' + d]));
            // output gate
            var h4 = G.mul(model['Wox' + d], input_vector);
            var h5 = G.mul(model['Woh' + d], hidden_prev);
            var output_gate = G.sigmoid(G.add(G.add(h4, h5), model['bo' + d]));
            // write operation on cells
            var h6 = G.mul(model['Wcx' + d], input_vector);
            var h7 = G.mul(model['Wch' + d], hidden_prev);
            var cell_write = G.tanh(G.add(G.add(h6, h7), model['bc' + d]));
            // compute new cell activation
            var retain_cell = G.eltmul(forget_gate, cell_prev); // what do we keep from cell
            var write_cell = G.eltmul(input_gate, cell_write); // what do we write to cell
            var cell_d = G.add(retain_cell, write_cell); // new cell contents
            // compute hidden state as gated, saturated cell activations
            var hidden_d = G.eltmul(output_gate, G.tanh(cell_d));
            hidden.push(hidden_d);
            cell.push(cell_d);
        }
        // one decoder to outputs at end
        var output = G.add(G.mul(model['Whd'], hidden[hidden.length - 1]), model['bd']);
        // return cell memory, hidden representation and output
        return {
            'h': hidden,
            'c': cell,
            'o': output
        };
    }
    var sig = function (x) {
        // helper function for computing sigmoid
        return 1.0 / (1 + Math.exp(-x));
    }
    var maxi = function (w) {
        // argmax of array w
        var maxv = w[0];
        var maxix = 0;
        for (var i = 1, n = w.length; i < n; i++) {
            var v = w[i];
            if (v > maxv) {
                maxix = i;
                maxv = v;
            }
        }
        return maxix;
    }
    var samplei = function (w) {
        // sample argmax from w, assuming w are 
        // probabilities that sum to one
        var r = randf(0, 1);
        var x = 0.0;
        var i = 0;
        while (true) {
            x += w[i];
            if (x > r) {
                return i;
            }
            i++;
        }
        return w.length - 1; // pretty sure we should never get here?
    }
    // various utils
    global.assert = assert;
    global.zeros = zeros;
    global.maxi = maxi;
    global.samplei = samplei;
    global.randi = randi;
    global.randn = randn;
    global.softmax = softmax;
    // classes
    global.Mat = Mat;
    global.RandMat = RandMat;
    global.forwardLSTM = forwardLSTM;
    global.initLSTM = initLSTM;
    // more utils
    global.updateMat = updateMat;
    global.updateNet = updateNet;
    global.copyMat = copyMat;
    global.copyNet = copyNet;
    global.netToJSON = netToJSON;
    global.netFromJSON = netFromJSON;
    global.netZeroGrads = netZeroGrads;
    global.netFlattenGrads = netFlattenGrads;
    // optimization
    global.Solver = Solver;
    global.Graph = Graph;
})(R);
// END OF RECURRENTJS
var RL = {};
(function (global) {
    "use strict";
    // syntactic sugar function for getting default parameter values
    var getopt = function (opt, field_name, default_value) {
        if (typeof opt === 'undefined') {
            return default_value;
        }
        return (typeof opt[field_name] !== 'undefined') ? opt[field_name] : default_value;
    }
    var zeros = R.zeros; // inherit these
    var assert = R.assert;
    var randi = R.randi;
    var randf = R.randf;
    var setConst = function (arr, c) {
        for (var i = 0, n = arr.length; i < n; i++) {
            arr[i] = c;
        }
    }
    var sampleWeighted = function (p) {
        var r = Math.random();
        var c = 0.0;
        for (var i = 0, n = p.length; i < n; i++) {
            c += p[i];
            if (c >= r) {
                return i;
            }
        }
        assert(false, 'wtf');
    }
    // ------
    // AGENTS
    // ------
    // DPAgent performs Value Iteration
    // - can also be used for Policy Iteration if you really wanted to
    // - requires model of the environment :(
    // - does not learn from experience :(
    // - assumes finite MDP :(
    var DPAgent = function (env, opt) {
        this.V = null; // state value function
        this.P = null; // policy distribution \pi(s,a)
        this.env = env; // store pointer to environment
        this.gamma = getopt(opt, 'gamma', 0.75); // future reward discount factor
        this.reset();
    }
    DPAgent.prototype = {
        reset: function () {
            // reset the agent's policy and value function
            this.ns = this.env.getNumStates();
            this.na = this.env.getMaxNumActions();
            this.V = zeros(this.ns);
            this.P = zeros(this.ns * this.na);
            // initialize uniform random policy
            for (var s = 0; s < this.ns; s++) {
                var poss = this.env.allowedActions(s);
                for (var i = 0, n = poss.length; i < n; i++) {
                    this.P[poss[i] * this.ns + s] = 1.0 / poss.length;
                }
            }
        },
        act: function (s) {
            // behave according to the learned policy
            var poss = this.env.allowedActions(s);
            var ps = [];
            for (var i = 0, n = poss.length; i < n; i++) {
                var a = poss[i];
                var prob = this.P[a * this.ns + s];
                ps.push(prob);
            }
            var maxi = sampleWeighted(ps);
            return poss[maxi];
        },
        learn: function () {
            // perform a single round of value iteration
            self.evaluatePolicy(); // writes this.V
            self.updatePolicy(); // writes this.P
        },
        evaluatePolicy: function () {
            // perform a synchronous update of the value function
            var Vnew = zeros(this.ns);
            for (var s = 0; s < this.ns; s++) {
                // integrate over actions in a stochastic policy
                // note that we assume that policy probability mass over allowed actions sums to one
                var v = 0.0;
                var poss = this.env.allowedActions(s);
                for (var i = 0, n = poss.length; i < n; i++) {
                    var a = poss[i];
                    var prob = this.P[a * this.ns + s]; // probability of taking action under policy
                    if (prob === 0) {
                        continue;
                    } // no contribution, skip for speed
                    var ns = this.env.nextStateDistribution(s, a);
                    var rs = this.env.reward(s, a, ns); // reward for s->a->ns transition
                    v += prob * (rs + this.gamma * this.V[ns]);
                }
                Vnew[s] = v;
            }
            this.V = Vnew; // swap
        },
        updatePolicy: function () {
            // update policy to be greedy w.r.t. learned Value function
            for (var s = 0; s < this.ns; s++) {
                var poss = this.env.allowedActions(s);
                // compute value of taking each allowed action
                var vmax, nmax;
                var vs = [];
                for (var i = 0, n = poss.length; i < n; i++) {
                    var a = poss[i];
                    var ns = this.env.nextStateDistribution(s, a);
                    var rs = this.env.reward(s, a, ns);
                    var v = rs + this.gamma * this.V[ns];
                    vs.push(v);
                    if (i === 0 || v > vmax) {
                        vmax = v;
                        nmax = 1;
                    } else if (v === vmax) {
                        nmax += 1;
                    }
                }
                // update policy smoothly across all argmaxy actions
                for (var i = 0, n = poss.length; i < n; i++) {
                    var a = poss[i];
                    this.P[a * this.ns + s] = (vs[i] === vmax) ? 1.0 / nmax : 0.0;
                }
            }
        },
    }
    // QAgent uses TD (Q-Learning, SARSA)
    // - does not require environment model :)
    // - learns from experience :)
    var TDAgent = function (env, opt) {
        this.update = getopt(opt, 'update', 'qlearn'); // qlearn | sarsa
        this.gamma = getopt(opt, 'gamma', 0.75); // future reward discount factor
        this.epsilon = getopt(opt, 'epsilon', 0.1); // for epsilon-greedy policy
        this.alpha = getopt(opt, 'alpha', 0.01); // value function learning rate
        // class allows non-deterministic policy, and smoothly regressing towards the optimal policy based on Q
        this.smooth_policy_update = getopt(opt, 'smooth_policy_update', false);
        this.beta = getopt(opt, 'beta', 0.01); // learning rate for policy, if smooth updates are on
        // eligibility traces
        this.lambda = getopt(opt, 'lambda', 0); // eligibility trace decay. 0 = no eligibility traces used
        this.replacing_traces = getopt(opt, 'replacing_traces', true);
        // optional optimistic initial values
        this.q_init_val = getopt(opt, 'q_init_val', 0);
        this.planN = getopt(opt, 'planN', 0); // number of planning steps per learning iteration (0 = no planning)
        this.Q = null; // state action value function
        this.P = null; // policy distribution \pi(s,a)
        this.e = null; // eligibility trace
        this.env_model_s = null;; // environment model (s,a) -> (s',r)
        this.env_model_r = null;; // environment model (s,a) -> (s',r)
        this.env = env; // store pointer to environment
        this.reset();
    }
    TDAgent.prototype = {
        reset: function () {
            // reset the agent's policy and value function
            this.ns = this.env.getNumStates();
            this.na = this.env.getMaxNumActions();
            this.Q = zeros(this.ns * this.na);
            if (this.q_init_val !== 0) {
                setConst(this.Q, this.q_init_val);
            }
            this.P = zeros(this.ns * this.na);
            this.e = zeros(this.ns * this.na);
            // model/planning vars
            this.env_model_s = zeros(this.ns * this.na);
            setConst(this.env_model_s, -1); // init to -1 so we can test if we saw the state before
            this.env_model_r = zeros(this.ns * this.na);
            this.sa_seen = [];
            this.pq = zeros(this.ns * this.na);
            // initialize uniform random policy
            for (var s = 0; s < this.ns; s++) {
                var poss = this.env.allowedActions(s);
                for (var i = 0, n = poss.length; i < n; i++) {
                    this.P[poss[i] * this.ns + s] = 1.0 / poss.length;
                }
            }
            // agent memory, needed for streaming updates
            // (s0,a0,r0,s1,a1,r1,...)
            this.r0 = null;
            this.s0 = null;
            this.s1 = null;
            this.a0 = null;
            this.a1 = null;
        },
        resetEpisode: function () {
            // an episode finished
        },
        act: function (s) {
            // act according to epsilon greedy policy
            var poss = this.env.allowedActions(s);
            var probs = [];
            for (var i = 0, n = poss.length; i < n; i++) {
                probs.push(this.P[poss[i] * this.ns + s]);
            }
            // epsilon greedy policy
            if (Math.random() < this.epsilon) {
                var a = poss[randi(0, poss.length)]; // random available action
                this.explored = true;
            } else {
                var a = poss[sampleWeighted(probs)];
                this.explored = false;
            }
            // shift state memory
            this.s0 = this.s1;
            this.a0 = this.a1;
            this.s1 = s;
            this.a1 = a;
            return a;
        },
        learn: function (r1) {
            // takes reward for previous action, which came from a call to act()
            if (!(this.r0 == null)) {
                this.learnFromTuple(this.s0, this.a0, this.r0, this.s1, this.a1, this.lambda);
                if (this.planN > 0) {
                    this.updateModel(this.s0, this.a0, this.r0, this.s1);
                    this.plan();
                }
            }
            this.r0 = r1; // store this for next update
        },
        updateModel: function (s0, a0, r0, s1) {
            // transition (s0,a0) -> (r0,s1) was observed. Update environment model
            var sa = a0 * this.ns + s0;
            if (this.env_model_s[sa] === -1) {
                // first time we see this state action
                this.sa_seen.push(a0 * this.ns + s0); // add as seen state
            }
            this.env_model_s[sa] = s1;
            this.env_model_r[sa] = r0;
        },
        plan: function () {
            // order the states based on current priority queue information
            var spq = [];
            for (var i = 0, n = this.sa_seen.length; i < n; i++) {
                var sa = this.sa_seen[i];
                var sap = this.pq[sa];
                if (sap > 1e-5) { // gain a bit of efficiency
                    spq.push({
                        sa: sa,
                        p: sap
                    });
                }
            }
            spq.sort(function (a, b) {
                return a.p < b.p ? 1 : -1
            });
            // perform the updates
            var nsteps = Math.min(this.planN, spq.length);
            for (var k = 0; k < nsteps; k++) {
                // random exploration
                //var i = randi(0, this.sa_seen.length); // pick random prev seen state action
                //var s0a0 = this.sa_seen[i];
                var s0a0 = spq[k].sa;
                this.pq[s0a0] = 0; // erase priority, since we're backing up this state
                var s0 = s0a0 % this.ns;
                var a0 = Math.floor(s0a0 / this.ns);
                var r0 = this.env_model_r[s0a0];
                var s1 = this.env_model_s[s0a0];
                var a1 = -1; // not used for Q learning
                if (this.update === 'sarsa') {
                    // generate random action?...
                    var poss = this.env.allowedActions(s1);
                    var a1 = poss[randi(0, poss.length)];
                }
                this.learnFromTuple(s0, a0, r0, s1, a1, 0); // note lambda = 0 - shouldnt use eligibility trace here
            }
        },
        learnFromTuple: function (s0, a0, r0, s1, a1, lambda) {
            var sa = a0 * this.ns + s0;
            // calculate the target for Q(s,a)
            if (this.update === 'qlearn') {
                // Q learning target is Q(s0,a0) = r0 + gamma * max_a Q[s1,a]
                var poss = this.env.allowedActions(s1);
                var qmax = 0;
                for (var i = 0, n = poss.length; i < n; i++) {
                    var s1a = poss[i] * this.ns + s1;
                    var qval = this.Q[s1a];
                    if (i === 0 || qval > qmax) {
                        qmax = qval;
                    }
                }
                var target = r0 + this.gamma * qmax;
            } else if (this.update === 'sarsa') {
                // SARSA target is Q(s0,a0) = r0 + gamma * Q[s1,a1]
                var s1a1 = a1 * this.ns + s1;
                var target = r0 + this.gamma * this.Q[s1a1];
            }
            if (lambda > 0) {
                // perform an eligibility trace update
                if (this.replacing_traces) {
                    this.e[sa] = 1;
                } else {
                    this.e[sa] += 1;
                }
                var edecay = lambda * this.gamma;
                var state_update = zeros(this.ns);
                for (var s = 0; s < this.ns; s++) {
                    var poss = this.env.allowedActions(s);
                    for (var i = 0; i < poss.length; i++) {
                        var a = poss[i];
                        var saloop = a * this.ns + s;
                        var esa = this.e[saloop];
                        var update = this.alpha * esa * (target - this.Q[saloop]);
                        this.Q[saloop] += update;
                        this.updatePriority(s, a, update);
                        this.e[saloop] *= edecay;
                        var u = Math.abs(update);
                        if (u > state_update[s]) {
                            state_update[s] = u;
                        }
                    }
                }
                for (var s = 0; s < this.ns; s++) {
                    if (state_update[s] > 1e-5) { // save efficiency here
                        this.updatePolicy(s);
                    }
                }
                if (this.explored && this.update === 'qlearn') {
                    // have to wipe the trace since q learning is off-policy :(
                    this.e = zeros(this.ns * this.na);
                }
            } else {
                // simpler and faster update without eligibility trace
                // update Q[sa] towards it with some step size
                var update = this.alpha * (target - this.Q[sa]);
                this.Q[sa] += update;
                this.updatePriority(s0, a0, update);
                // update the policy to reflect the change (if appropriate)
                this.updatePolicy(s0);
            }
        },
        updatePriority: function (s, a, u) {
            // used in planning. Invoked when Q[sa] += update
            // we should find all states that lead to (s,a) and upgrade their priority
            // of being update in the next planning step
            u = Math.abs(u);
            if (u < 1e-5) {
                return;
            } // for efficiency skip small updates
            if (this.planN === 0) {
                return;
            } // there is no planning to be done, skip.
            for (var si = 0; si < this.ns; si++) {
                // note we are also iterating over impossible actions at all states,
                // but this should be okay because their env_model_s should simply be -1
                // as initialized, so they will never be predicted to point to any state
                // because they will never be observed, and hence never be added to the model
                for (var ai = 0; ai < this.na; ai++) {
                    var siai = ai * this.ns + si;
                    if (this.env_model_s[siai] === s) {
                        // this state leads to s, add it to priority queue
                        this.pq[siai] += u;
                    }
                }
            }
        },
        updatePolicy: function (s) {
            var poss = this.env.allowedActions(s);
            // set policy at s to be the action that achieves max_a Q(s,a)
            // first find the maxy Q values
            var qmax, nmax;
            var qs = [];
            for (var i = 0, n = poss.length; i < n; i++) {
                var a = poss[i];
                var qval = this.Q[a * this.ns + s];
                qs.push(qval);
                if (i === 0 || qval > qmax) {
                    qmax = qval;
                    nmax = 1;
                } else if (qval === qmax) {
                    nmax += 1;
                }
            }
            // now update the policy smoothly towards the argmaxy actions
            var psum = 0.0;
            for (var i = 0, n = poss.length; i < n; i++) {
                var a = poss[i];
                var target = (qs[i] === qmax) ? 1.0 / nmax : 0.0;
                var ix = a * this.ns + s;
                if (this.smooth_policy_update) {
                    // slightly hacky :p
                    this.P[ix] += this.beta * (target - this.P[ix]);
                    psum += this.P[ix];
                } else {
                    // set hard target
                    this.P[ix] = target;
                }
            }
            if (this.smooth_policy_update) {
                // renomalize P if we're using smooth policy updates
                for (var i = 0, n = poss.length; i < n; i++) {
                    var a = poss[i];
                    this.P[a * this.ns + s] /= psum;
                }
            }
        }
    }
    var DQNAgent = function (env, opt) {
        this.gamma = getopt(opt, 'gamma', 0.75); // future reward discount factor
        this.epsilon = getopt(opt, 'epsilon', 0.1); // for epsilon-greedy policy
        this.alpha = getopt(opt, 'alpha', 0.01); // value function learning rate
        this.experience_add_every = getopt(opt, 'experience_add_every', 25); // number of time steps before we add another experience to replay memory
        this.experience_size = getopt(opt, 'experience_size', 5000); // size of experience replay
        this.learning_steps_per_iteration = getopt(opt, 'learning_steps_per_iteration', 10);
        this.tderror_clamp = getopt(opt, 'tderror_clamp', 1.0);
        this.num_hidden_units = getopt(opt, 'num_hidden_units', 100);
        this.env = env;
        this.reset();
    }
    DQNAgent.prototype = {
        reset: function () {
            this.nh = this.num_hidden_units; // number of hidden units
            this.ns = this.env.getNumStates();
            this.na = this.env.getMaxNumActions();
            // nets are hardcoded for now as key (str) -> Mat
            // not proud of this. better solution is to have a whole Net object
            // on top of Mats, but for now sticking with this
            this.net = {};
            this.net.W1 = new R.RandMat(this.nh, this.ns, 0, 0.01);
            this.net.b1 = new R.Mat(this.nh, 1, 0, 0.01);
            this.net.W2 = new R.RandMat(this.na, this.nh, 0, 0.01);
            this.net.b2 = new R.Mat(this.na, 1, 0, 0.01);
            this.exp = []; // experience
            this.expi = 0; // where to insert
            this.t = 0;
            this.r0 = null;
            this.s0 = null;
            this.s1 = null;
            this.a0 = null;
            this.a1 = null;
            this.tderror = 0; // for visualization only...
        },
        toJSON: function () {
            // save function
            var j = {};
            j.nh = this.nh;
            j.ns = this.ns;
            j.na = this.na;
            j.net = R.netToJSON(this.net);
            return j;
        },
        fromJSON: function (j) {
            // load function
            this.nh = j.nh;
            this.ns = j.ns;
            this.na = j.na;
            this.net = R.netFromJSON(j.net);
        },
        forwardQ: function (net, s, needs_backprop) {
            var G = new R.Graph(needs_backprop);
            var a1mat = G.add(G.mul(net.W1, s), net.b1);
            var h1mat = G.tanh(a1mat);
            var a2mat = G.add(G.mul(net.W2, h1mat), net.b2);
            this.lastG = G; // back this up. Kind of hacky isn't it
            return a2mat;
        },
        act: function (slist) {
            // convert to a Mat column vector
            var s = new R.Mat(this.ns, 1);
            s.setFrom(slist);
            // epsilon greedy policy
            if (Math.random() < this.epsilon) {
                var a = randi(0, this.na);
            } else {
                // greedy wrt Q function
                var amat = this.forwardQ(this.net, s, false);
                var a = R.maxi(amat.w); // returns index of argmax action
            }
            // shift state memory
            this.s0 = this.s1;
            this.a0 = this.a1;
            this.s1 = s;
            this.a1 = a;
            return a;
        },
        learn: function (r1) {
            // perform an update on Q function
            if (!(this.r0 == null) && this.alpha > 0) {
                // learn from this tuple to get a sense of how "surprising" it is to the agent
                var tderror = this.learnFromTuple(this.s0, this.a0, this.r0, this.s1, this.a1);
                this.tderror = tderror; // a measure of surprise
                // decide if we should keep this experience in the replay
                if (this.t % this.experience_add_every === 0) {
                    this.exp[this.expi] = [this.s0, this.a0, this.r0, this.s1, this.a1];
                    this.expi += 1;
                    if (this.expi > this.experience_size) {
                        this.expi = 0;
                    } // roll over when we run out
                }
                this.t += 1;
                // sample some additional experience from replay memory and learn from it
                for (var k = 0; k < this.learning_steps_per_iteration; k++) {
                    var ri = randi(0, this.exp.length); // todo: priority sweeps?
                    var e = this.exp[ri];
                    this.learnFromTuple(e[0], e[1], e[2], e[3], e[4])
                }
            }
            this.r0 = r1; // store for next update
        },
        learnFromTuple: function (s0, a0, r0, s1, a1) {
            // want: Q(s,a) = r + gamma * max_a' Q(s',a')
            // compute the target Q value
            var tmat = this.forwardQ(this.net, s1, false);
            var qmax = r0 + this.gamma * tmat.w[R.maxi(tmat.w)];
            // now predict
            var pred = this.forwardQ(this.net, s0, true);
            var tderror = pred.w[a0] - qmax;
            var clamp = this.tderror_clamp;
            if (Math.abs(tderror) > clamp) { // huber loss to robustify
                if (tderror > clamp) tderror = clamp;
                if (tderror < -clamp) tderror = -clamp;
            }
            pred.dw[a0] = tderror;
            this.lastG.backward(); // compute gradients on net params
            // update net
            R.updateNet(this.net, this.alpha);
            return tderror;
        }
    }
    // buggy implementation, doesnt work...
    var SimpleReinforceAgent = function (env, opt) {
        this.gamma = getopt(opt, 'gamma', 0.5); // future reward discount factor
        this.epsilon = getopt(opt, 'epsilon', 0.75); // for epsilon-greedy policy
        this.alpha = getopt(opt, 'alpha', 0.001); // actor net learning rate
        this.beta = getopt(opt, 'beta', 0.01); // baseline net learning rate
        this.env = env;
        this.reset();
    }
    SimpleReinforceAgent.prototype = {
        reset: function () {
            this.ns = this.env.getNumStates();
            this.na = this.env.getMaxNumActions();
            this.nh = 100; // number of hidden units
            this.nhb = 100; // and also in the baseline lstm
            this.actorNet = {};
            this.actorNet.W1 = new R.RandMat(this.nh, this.ns, 0, 0.01);
            this.actorNet.b1 = new R.Mat(this.nh, 1, 0, 0.01);
            this.actorNet.W2 = new R.RandMat(this.na, this.nh, 0, 0.1);
            this.actorNet.b2 = new R.Mat(this.na, 1, 0, 0.01);
            this.actorOutputs = [];
            this.actorGraphs = [];
            this.actorActions = []; // sampled ones
            this.rewardHistory = [];
            this.baselineNet = {};
            this.baselineNet.W1 = new R.RandMat(this.nhb, this.ns, 0, 0.01);
            this.baselineNet.b1 = new R.Mat(this.nhb, 1, 0, 0.01);
            this.baselineNet.W2 = new R.RandMat(this.na, this.nhb, 0, 0.01);
            this.baselineNet.b2 = new R.Mat(this.na, 1, 0, 0.01);
            this.baselineOutputs = [];
            this.baselineGraphs = [];
            this.t = 0;
        },
        forwardActor: function (s, needs_backprop) {
            var net = this.actorNet;
            var G = new R.Graph(needs_backprop);
            var a1mat = G.add(G.mul(net.W1, s), net.b1);
            var h1mat = G.tanh(a1mat);
            var a2mat = G.add(G.mul(net.W2, h1mat), net.b2);
            return {
                'a': a2mat,
                'G': G
            }
        },
        forwardValue: function (s, needs_backprop) {
            var net = this.baselineNet;
            var G = new R.Graph(needs_backprop);
            var a1mat = G.add(G.mul(net.W1, s), net.b1);
            var h1mat = G.tanh(a1mat);
            var a2mat = G.add(G.mul(net.W2, h1mat), net.b2);
            return {
                'a': a2mat,
                'G': G
            }
        },
        act: function (slist) {
            // convert to a Mat column vector
            var s = new R.Mat(this.ns, 1);
            s.setFrom(slist);
            // forward the actor to get action output
            var ans = this.forwardActor(s, true);
            var amat = ans.a;
            var ag = ans.G;
            this.actorOutputs.push(amat);
            this.actorGraphs.push(ag);
            // forward the baseline estimator
            var ans = this.forwardValue(s, true);
            var vmat = ans.a;
            var vg = ans.G;
            this.baselineOutputs.push(vmat);
            this.baselineGraphs.push(vg);
            // sample action from the stochastic gaussian policy
            var a = R.copyMat(amat);
            var gaussVar = 0.02;
            a.w[0] = R.randn(0, gaussVar);
            a.w[1] = R.randn(0, gaussVar);
            this.actorActions.push(a);
            // shift state memory
            this.s0 = this.s1;
            this.a0 = this.a1;
            this.s1 = s;
            this.a1 = a;
            return a;
        },
        learn: function (r1) {
            // perform an update on Q function
            this.rewardHistory.push(r1);
            var n = this.rewardHistory.length;
            var baselineMSE = 0.0;
            var nup = 100; // what chunk of experience to take
            var nuse = 80; // what chunk to update from
            if (n >= nup) {
                // lets learn and flush
                // first: compute the sample values at all points
                var vs = [];
                for (var t = 0; t < nuse; t++) {
                    var mul = 1;
                    // compute the actual discounted reward for this time step
                    var V = 0;
                    for (var t2 = t; t2 < n; t2++) {
                        V += mul * this.rewardHistory[t2];
                        mul *= this.gamma;
                        if (mul < 1e-5) {
                            break;
                        } // efficiency savings
                    }
                    // get the predicted baseline at this time step
                    var b = this.baselineOutputs[t].w[0];
                    for (var i = 0; i < this.na; i++) {
                        // [the action delta] * [the desirebility]
                        var update = -(V - b) * (this.actorActions[t].w[i] - this.actorOutputs[t].w[i]);
                        if (update > 0.1) {
                            update = 0.1;
                        }
                        if (update < -0.1) {
                            update = -0.1;
                        }
                        this.actorOutputs[t].dw[i] += update;
                    }
                    var update = -(V - b);
                    if (update > 0.1) {
                        update = 0.1;
                    }
                    if (update < 0.1) {
                        update = -0.1;
                    }
                    this.baselineOutputs[t].dw[0] += update;
                    baselineMSE += (V - b) * (V - b);
                    vs.push(V);
                }
                baselineMSE /= nuse;
                // backprop all the things
                for (var t = 0; t < nuse; t++) {
                    this.actorGraphs[t].backward();
                    this.baselineGraphs[t].backward();
                }
                R.updateNet(this.actorNet, this.alpha); // update actor network
                R.updateNet(this.baselineNet, this.beta); // update baseline network
                // flush
                this.actorOutputs = [];
                this.rewardHistory = [];
                this.actorActions = [];
                this.baselineOutputs = [];
                this.actorGraphs = [];
                this.baselineGraphs = [];
                this.tderror = baselineMSE;
            }
            this.t += 1;
            this.r0 = r1; // store for next update
        },
    }
    // buggy implementation as well, doesn't work
    var RecurrentReinforceAgent = function (env, opt) {
        this.gamma = getopt(opt, 'gamma', 0.5); // future reward discount factor
        this.epsilon = getopt(opt, 'epsilon', 0.1); // for epsilon-greedy policy
        this.alpha = getopt(opt, 'alpha', 0.001); // actor net learning rate
        this.beta = getopt(opt, 'beta', 0.01); // baseline net learning rate
        this.env = env;
        this.reset();
    }
    RecurrentReinforceAgent.prototype = {
        reset: function () {
            this.ns = this.env.getNumStates();
            this.na = this.env.getMaxNumActions();
            this.nh = 40; // number of hidden units
            this.nhb = 40; // and also in the baseline lstm
            this.actorLSTM = R.initLSTM(this.ns, [this.nh], this.na);
            this.actorG = new R.Graph();
            this.actorPrev = null;
            this.actorOutputs = [];
            this.rewardHistory = [];
            this.actorActions = [];
            this.baselineLSTM = R.initLSTM(this.ns, [this.nhb], 1);
            this.baselineG = new R.Graph();
            this.baselinePrev = null;
            this.baselineOutputs = [];
            this.t = 0;
            this.r0 = null;
            this.s0 = null;
            this.s1 = null;
            this.a0 = null;
            this.a1 = null;
        },
        act: function (slist) {
            // convert to a Mat column vector
            var s = new R.Mat(this.ns, 1);
            s.setFrom(slist);
            // forward the LSTM to get action distribution
            var actorNext = R.forwardLSTM(this.actorG, this.actorLSTM, [this.nh], s, this.actorPrev);
            this.actorPrev = actorNext;
            var amat = actorNext.o;
            this.actorOutputs.push(amat);
            // forward the baseline LSTM
            var baselineNext = R.forwardLSTM(this.baselineG, this.baselineLSTM, [this.nhb], s, this.baselinePrev);
            this.baselinePrev = baselineNext;
            this.baselineOutputs.push(baselineNext.o);
            // sample action from actor policy
            var gaussVar = 0.05;
            var a = R.copyMat(amat);
            for (var i = 0, n = a.w.length; i < n; i++) {
                a.w[0] += R.randn(0, gaussVar);
                a.w[1] += R.randn(0, gaussVar);
            }
            this.actorActions.push(a);
            // shift state memory
            this.s0 = this.s1;
            this.a0 = this.a1;
            this.s1 = s;
            this.a1 = a;
            return a;
        },
        learn: function (r1) {
            // perform an update on Q function
            this.rewardHistory.push(r1);
            var n = this.rewardHistory.length;
            var baselineMSE = 0.0;
            var nup = 100; // what chunk of experience to take
            var nuse = 80; // what chunk to also update
            if (n >= nup) {
                // lets learn and flush
                // first: compute the sample values at all points
                var vs = [];
                for (var t = 0; t < nuse; t++) {
                    var mul = 1;
                    var V = 0;
                    for (var t2 = t; t2 < n; t2++) {
                        V += mul * this.rewardHistory[t2];
                        mul *= this.gamma;
                        if (mul < 1e-5) {
                            break;
                        } // efficiency savings
                    }
                    var b = this.baselineOutputs[t].w[0];
                    // todo: take out the constants etc.
                    for (var i = 0; i < this.na; i++) {
                        // [the action delta] * [the desirebility]
                        var update = -(V - b) * (this.actorActions[t].w[i] - this.actorOutputs[t].w[i]);
                        if (update > 0.1) {
                            update = 0.1;
                        }
                        if (update < -0.1) {
                            update = -0.1;
                        }
                        this.actorOutputs[t].dw[i] += update;
                    }
                    var update = -(V - b);
                    if (update > 0.1) {
                        update = 0.1;
                    }
                    if (update < 0.1) {
                        update = -0.1;
                    }
                    this.baselineOutputs[t].dw[0] += update;
                    baselineMSE += (V - b) * (V - b);
                    vs.push(V);
                }
                baselineMSE /= nuse;
                this.actorG.backward(); // update params! woohoo!
                this.baselineG.backward();
                R.updateNet(this.actorLSTM, this.alpha); // update actor network
                R.updateNet(this.baselineLSTM, this.beta); // update baseline network
                // flush
                this.actorG = new R.Graph();
                this.actorPrev = null;
                this.actorOutputs = [];
                this.rewardHistory = [];
                this.actorActions = [];
                this.baselineG = new R.Graph();
                this.baselinePrev = null;
                this.baselineOutputs = [];
                this.tderror = baselineMSE;
            }
            this.t += 1;
            this.r0 = r1; // store for next update
        },
    }
    // Currently buggy implementation, doesnt work
    var DeterministPG = function (env, opt) {
        this.gamma = getopt(opt, 'gamma', 0.5); // future reward discount factor
        this.epsilon = getopt(opt, 'epsilon', 0.5); // for epsilon-greedy policy
        this.alpha = getopt(opt, 'alpha', 0.001); // actor net learning rate
        this.beta = getopt(opt, 'beta', 0.01); // baseline net learning rate
        this.env = env;
        this.reset();
    }
    DeterministPG.prototype = {
        reset: function () {
            this.ns = this.env.getNumStates();
            this.na = this.env.getMaxNumActions();
            this.nh = 100; // number of hidden units
            // actor
            this.actorNet = {};
            this.actorNet.W1 = new R.RandMat(this.nh, this.ns, 0, 0.01);
            this.actorNet.b1 = new R.Mat(this.nh, 1, 0, 0.01);
            this.actorNet.W2 = new R.RandMat(this.na, this.ns, 0, 0.1);
            this.actorNet.b2 = new R.Mat(this.na, 1, 0, 0.01);
            this.ntheta = this.na * this.ns + this.na; // number of params in actor
            // critic
            this.criticw = new R.RandMat(1, this.ntheta, 0, 0.01); // row vector
            this.r0 = null;
            this.s0 = null;
            this.s1 = null;
            this.a0 = null;
            this.a1 = null;
            this.t = 0;
        },
        forwardActor: function (s, needs_backprop) {
            var net = this.actorNet;
            var G = new R.Graph(needs_backprop);
            var a1mat = G.add(G.mul(net.W1, s), net.b1);
            var h1mat = G.tanh(a1mat);
            var a2mat = G.add(G.mul(net.W2, h1mat), net.b2);
            return {
                'a': a2mat,
                'G': G
            }
        },
        act: function (slist) {
            // convert to a Mat column vector
            var s = new R.Mat(this.ns, 1);
            s.setFrom(slist);
            // forward the actor to get action output
            var ans = this.forwardActor(s, false);
            var amat = ans.a;
            var ag = ans.G;
            // sample action from the stochastic gaussian policy
            var a = R.copyMat(amat);
            if (Math.random() < this.epsilon) {
                var gaussVar = 0.02;
                a.w[0] = R.randn(0, gaussVar);
                a.w[1] = R.randn(0, gaussVar);
            }
            var clamp = 0.25;
            if (a.w[0] > clamp) a.w[0] = clamp;
            if (a.w[0] < -clamp) a.w[0] = -clamp;
            if (a.w[1] > clamp) a.w[1] = clamp;
            if (a.w[1] < -clamp) a.w[1] = -clamp;
            // shift state memory
            this.s0 = this.s1;
            this.a0 = this.a1;
            this.s1 = s;
            this.a1 = a;
            return a;
        },
        utilJacobianAt: function (s) {
            var ujacobian = new R.Mat(this.ntheta, this.na);
            for (var a = 0; a < this.na; a++) {
                R.netZeroGrads(this.actorNet);
                var ag = this.forwardActor(this.s0, true);
                ag.a.dw[a] = 1.0;
                ag.G.backward();
                var gflat = R.netFlattenGrads(this.actorNet);
                ujacobian.setColumn(gflat, a);
            }
            return ujacobian;
        },
        learn: function (r1) {
            // perform an update on Q function
            //this.rewardHistory.push(r1);
            if (!(this.r0 == null)) {
                var Gtmp = new R.Graph(false);
                // dpg update:
                // first compute the features psi:
                // the jacobian matrix of the actor for s
                var ujacobian0 = this.utilJacobianAt(this.s0);
                // now form the features \psi(s,a)
                var psi_sa0 = Gtmp.mul(ujacobian0, this.a0); // should be [this.ntheta x 1] "feature" vector
                var qw0 = Gtmp.mul(this.criticw, psi_sa0); // 1x1
                // now do the same thing because we need \psi(s_{t+1}, \mu\_\theta(s\_t{t+1}))
                var ujacobian1 = this.utilJacobianAt(this.s1);
                var ag = this.forwardActor(this.s1, false);
                var psi_sa1 = Gtmp.mul(ujacobian1, ag.a);
                var qw1 = Gtmp.mul(this.criticw, psi_sa1); // 1x1
                // get the td error finally
                var tderror = this.r0 + this.gamma * qw1.w[0] - qw0.w[0]; // lol
                if (tderror > 0.5) tderror = 0.5; // clamp
                if (tderror < -0.5) tderror = -0.5;
                this.tderror = tderror;
                // update actor policy with natural gradient
                var net = this.actorNet;
                var ix = 0;
                for (var p in net) {
                    var mat = net[p];
                    if (net.hasOwnProperty(p)) {
                        for (var i = 0, n = mat.w.length; i < n; i++) {
                            mat.w[i] += this.alpha * this.criticw.w[ix]; // natural gradient update
                            ix += 1;
                        }
                    }
                }
                // update the critic parameters too
                for (var i = 0; i < this.ntheta; i++) {
                    var update = this.beta * tderror * psi_sa0.w[i];
                    this.criticw.w[i] += update;
                }
            }
            this.r0 = r1; // store for next update
        },
    }
    // exports
    global.DPAgent = DPAgent;
    global.TDAgent = TDAgent;
    global.DQNAgent = DQNAgent;
    //global.SimpleReinforceAgent = SimpleReinforceAgent;
    //global.RecurrentReinforceAgent = RecurrentReinforceAgent;
    //global.DeterministPG = DeterministPG;
})(RL);
var Trevel = {
    //settings you can change
    stop: false,
    maxBet: 0.000005,
    minBet: 0.00000005,
    swap: true,
    betSpeed: 100,//change this on init
    verbose: true,
    isTesting: false,
    //money management
    useKelly: false,//martingale performs better on live account!
    kellyPercent: 5, //can't be more than 100 or less than 1
    useMartingale: true, //if kelly is true this won't work
    martingaleMultiplier: 2,
    //bot settings, these are set automaticcally don't bother
    currentBalance: 0,
    startingBalance: 0,
    betAmount: 0,
    profit: 0,
    totalBets: 0,
    totalWins: 0,
    winRate: 0,
    betHistory: [], //this is a sequence of all winning bets not the sequence of bets we placed
    betOutcomes: [],
    hbProbability: 0,
    lbProbability: 0,
    hbCount: 0,
    lbcount: 0,
    nextBet: "",
    previousReward: 0,
    addBet: function (bet, outcome) {
        if (bet === "LB" && outcome === "Win") {
            Trevel.betHistory.push("LO");
            Trevel.betOutcomes.push("W");
            Trevel.totalWins++;
            Trevel.lbcount++;
        }
        if (bet === "LB" && outcome === "Loose") {
            Trevel.betHistory.push("HI");
            Trevel.hbCount++;
            Trevel.betOutcomes.push("L");
        }
        if (bet === "HB" && outcome === "Win") {
            Trevel.betHistory.push("HI");
            Trevel.totalWins++;
            Trevel.hbCount++;
            Trevel.betOutcomes.push("W");
        }
        if (bet === "HB" && outcome === "Loose") {
            Trevel.betHistory.push("LO");
            Trevel.lbcount++;
            Trevel.betOutcomes.push("L");
        }
        Trevel.totalBets++;
    },
    calculateProbabilities: function () {
        Trevel.hbProbability = Trevel.hbCount / Trevel.betHistory.length;
        Trevel.lbProbability = Trevel.lbcount / Trevel.betHistory.length;
        Trevel.winRate = Trevel.totalWins / Trevel.totalBets;
        if (Trevel.isTesting === false) {
            Trevel.profit = Trevel.getProfit();
        }
    },
    getCurrentBalance: function () {
        return parseFloat($('#balance').html());
    },
    placeHighBet: function () {
        DoubleYourBTC('hi');
    },
    placeLowBet: function () {
        DoubleYourBTC('lo');
    },
    setBetAmount: function (amount) {
        var elem = stake.val(amount);
    },
    setOutcome: function (bet) {
        if (lose.display == 'block') {
            if (chance <= 2) {
                chance = 2;
            }
            else {
                chance -= 0.05;
            }
            payout.val(chance);
            Trevel.addBet(bet, "Loose");

        }
        else if (win.display == 'block') {
            if (chance >= 2.5) {
                chance = 2.5;
            }
            else {
                chance += 0.05;
            }
            payout.val(chance);
            Trevel.addBet(bet, "Win");

        }

    },
    prepareBet: function () {
        Trevel.calculateProbabilities();

        if (Trevel.betHistory.length < 10) {
            if (Trevel.useMartingale === true) {
                if (lose.display == 'block' && parseFloat(stake.val()) * parseFloat(payout.val()) < Trevel.maxBet) {
                    if (stake.val() > 0.00000100) {
                        Trevel.setBetAmount((parseFloat(stake.val()) * parseFloat(payout.val() / 2)).toFixed(8));
                    }
                    else {
                        Trevel.setBetAmount((parseFloat(stake.val()) * parseFloat(payout.val())).toFixed(8));
                    }
                }
                else if (win.display == 'block') {
                    if (parseFloat(stake.val()) / parseFloat(payout.val()) < Trevel.minBet) {
                        Trevel.setBetAmount(Trevel.minBet);
                    }
                    else {
                        if (stake.val() > 0.00000100) {
                            Trevel.setBetAmount(parseFloat(stake.val()) / parseFloat(payout.val() * 2).toFixed(8));
                        }
                        else {
                            Trevel.setBetAmount(parseFloat(stake.val()) / parseFloat(payout.val()).toFixed(8));
                        }
                    }
                }
            }
        }
        else {
            if (Trevel.useKelly === true && Trevel.betHistory.length > 12) {
                Trevel.currentBalance = Trevel.getCurrentBalance();

                var currMulty = parseFloat(payout.val());

                var kellyAmount = (((Trevel.currentBalance * Trevel.kellyPercent) / 100) * ((Trevel.winRate * currMulty - 1)) / (currMulty - 1)).toFixed(8);

                if (kellyAmount > 0 && kellyAmount < Trevel.maxBet) {
                    Trevel.setBetAmount(kellyAmount);
                }
                else {
                    if (parseFloat(stake.val()) / parseFloat(payout.val()) < Trevel.minBet) {
                        Trevel.setBetAmount(Trevel.minBet);
                    }
                    else {
                        if (stake.val() > 0.00000100) {
                            Trevel.setBetAmount(parseFloat(stake.val()) / parseFloat(payout.val() * 2).toFixed(8));

                        }
                        else {
                            Trevel.setBetAmount(parseFloat(stake.val()) / parseFloat(payout.val()).toFixed(8));
                        }
                    }
                }
            }
            else if (Trevel.useMartingale === true && Trevel.betHistory.length > 12) {
                if (lose.display == 'block' && parseFloat(stake.val()) * parseFloat(payout.val()) < Trevel.maxBet) {
                    if (stake.val() > 0.00000100) {
                        Trevel.setBetAmount(parseFloat(stake.val()) * parseFloat(payout.val() / 2).toFixed(8));

                    }
                    else {
                        Trevel.setBetAmount(parseFloat(stake.val()) * parseFloat(payout.val()).toFixed(8));
                    }
                }
                else if (win.display == 'block') {
                    if (parseFloat(stake.val()) / parseFloat(payout.val()) < Trevel.minBet) {
                        Trevel.setBetAmount(Trevel.minBet);
                    }
                    else {
                        if (stake.val() > 0.00000100) {
                            Trevel.setBetAmount(parseFloat(stake.val()) / parseFloat(payout.val() * 2).toFixed(8));

                        }
                        else {
                            Trevel.setBetAmount(parseFloat(stake.val()) / parseFloat(payout.val()).toFixed(8));
                        }
                    }
                }
            }
        }
    },
    placeBet: function () {
        if (Trevel.nextBet === "HB") {
            Trevel.placeHighBet();
        } else if (Trevel.nextBet === "LB") {
            Trevel.placeLowBet();
        } else if (Trevel.betHistory.length > 0 && Trevel.swap === true) {
            var prev = Trevel.betHistory[Trevel.betHistory.length - 1];
            if (prev === "LO") {
                Trevel.placeHighBet();
            } else {
                Trevel.placeLowBet();
            }
        } else {
            Trevel.placeLowBet();
        }
    },
    getProfit: function () {
        return (Trevel.getCurrentBalance() - Trevel.startingBalance).toFixed(8);
    },
    getNumStates: function () {
        return 8;
    },
    getMaxNumActions: function () {
        return 2;
    },
    getSentiment: function (bet) {
        if (bet === "HI") {
            return 1;
        } else {
            return 0;
        }
    },
    getPreviousBets: function () {
        var hist = [];
        if (Trevel.betHistory.length > 12) {
            hist.push(Trevel.getSentiment(Trevel.betHistory[Trevel.betHistory.length - 1]));
            hist.push(Trevel.getSentiment(Trevel.betHistory[Trevel.betHistory.length - 2]));
            hist.push(Trevel.getSentiment(Trevel.betHistory[Trevel.betHistory.length - 3]));
            hist.push(Trevel.getSentiment(Trevel.betHistory[Trevel.betHistory.length - 4]));
            hist.push(Trevel.getSentiment(Trevel.betHistory[Trevel.betHistory.length - 5]));
            hist.push(Trevel.getSentiment(Trevel.betHistory[Trevel.betHistory.length - 6]));
            hist.push(Trevel.getSentiment(Trevel.betHistory[Trevel.betHistory.length - 7]));
            hist.push(Trevel.getSentiment(Trevel.betHistory[Trevel.betHistory.length - 8]));
        } else {
            hist = [0, 1, 0, 1, 0, 1, 0, 1]; //incase we just started...
        }
        return hist;
    },
    getAgentState: function () { //we'll observe the last 8 bets
        var s = Trevel.getPreviousBets();
        return s;
    },
    getReward: function () {
        var reward = 0;
        var out1 = Trevel.betOutcomes[Trevel.betOutcomes.length - 1];
        var out2 = Trevel.betOutcomes[Trevel.betOutcomes.length - 2];
        if (out1 === "L") {
            if (Trevel.previousReward < 0) {
                reward = Trevel.previousReward;
                reward += -0.03;
                if (out2 === "L") {
                    reward += -0.03;
                }
            }
            else {
                reward = -0.03;
                if (out2 === "L") {
                    reward += -0.03;
                }
            }
        }
        else {
            if (Trevel.previousReward > 0) {
                reward = Trevel.previousReward;
                reward += 0.01;
                if (out2 === "W") {
                    reward += 0.01;
                }
            }
            else {
                reward = 0.01;
                if (out2 === "W") {
                    reward += 0.01;
                }
            }
        }
        return reward;
    },
    //for raw testing only
    randomNumber: function (min, max) {
        return Math.floor(Math.random() * (max - min + 1) + min);
    },
    getTestOutcome: function (random) {
        if (random % 2 == 0) {
            return "HI";
        } else {
            return "LO";
        }
    },
    //initialize Trevel
    init: function () {
        Trevel.startingBalance = Trevel.currentBalance = balance;
        Trevel.setBetAmount(Trevel.minBet);
        Trevel.stop = false;
        Trevel.swap = true;
        Trevel.betSpeed = 1000;
    }
};
//Global variables
var balance = function () { return $('#balance').html(); },
    stake = $('#double_your_btc_stake'), //.val()
    payout = $('#double_your_btc_payout_multiplier'), //.val()
    hibet = function () { DoubleYourBTC('hi'); },
    lobet = function () { DoubleYourBTC('lo'); },
    win = document.getElementById('double_your_btc_bet_win').style,
    lose = document.getElementById('double_your_btc_bet_lose').style,
    chance = parseFloat(payout.val());
rString = function (length, chars) {
    var result = '';
    var length = 16;
    var chars = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789abcdefghijklmnopqrstuvwxyz'
    for (var i = length; i > 0; --i) result += chars[Math.floor(Math.random() * chars.length)];
    return result;
},
    rSeed = function () { $('#next_client_seed').val(rString()); };

//Deep Q learning with reinforceJS
var spec = {}
spec.update = 'qlearn';
spec.gamma = 0.9;
spec.epsilon = 0.45;
spec.alpha = 0.01;
spec.experience_add_every = 12;
spec.experience_size = 100000;
spec.learning_steps_per_iteration = 24;
spec.tderror_clamp = 0.7;
spec.num_hidden_units = 24;
// create an environment object
var env = Trevel;
if (env.isTesting === false) {
    env.init();
}
// create the DQN agent
agent = new RL.DQNAgent(env, spec);
setInterval(function () {
    if (env.stop === false) {
        var state = env.getAgentState();
        var action = agent.act(state);
        var outcome = "";
        if (env.isTesting === false) {
            if (action === 0) {
                env.nextBet = "LB";
                env.prepareBet();
                env.placeBet();
                env.setOutcome("LB");
                outcome = env.betOutcomes[env.betOutcomes.length - 1];
            } else if (action === 1) {
                env.nextBet = "HB";
                env.prepareBet();
                env.placeBet();
                env.setOutcome("HB");
                outcome = env.betOutcomes[env.betOutcomes.length - 1];
            }
            if (env.verbose === true) {
                env.calculateProbabilities();
                //console.log("Machine Bet: " + action + "{" + env.nextBet + "} isKelly: " + env.useKelly + " isMartingale: " + env.useMartingale);
                console.log("Profit: " + env.profit + " WinRate: " + (env.winRate * 100).toFixed(2));
                console.log("Balance: " + balance() + " Stake: " + stake.val() + " Payout: " + payout.val());
            }
        } else {
            console.log("Action: " + action);
            var testOutcome = env.getTestOutcome(env.randomNumber(0, 1000));
            if (action === 0 && testOutcome === "LO") {
                env.addBet("LB", "Win");
                outcome = "W";
            } else if (action === 0 && testOutcome === "HI") {
                env.addBet("LB", "Loose");
                outcome = "L";
            } else if (action === 1 && testOutcome === "HI") {
                env.addBet("HB", "Win");
                outcome = "W";
            } else if (action === 1 && testOutcome === "LO") {
                env.addBet("HB", "Loose");
                outcome = "L";
            }
            env.calculateProbabilities();
            console.log("Winrate: " + (env.winRate * 100).toFixed(2));
        }
        var reward = env.getReward();
        if (reward == 0) {
            if (outcome === "L") {
                reward = -0.03;
            } else {
                reward = 0.01;
            }
        }
        agent.learn(reward);
        env.previousReward = reward;
    }
}, env.betSpeed);
There's what I'm asking to do only it doesnt reward the bot. otherwise it works pretty well, with decent profits rather quick.
If you dif this code with the version you posted, youll see I added a few things other than changing how the bet stake and payout multiplier is handled, such as a function that creates a random client seed and changed a few of the html elements to global variables
mbithy / Trevel

Implementing more actions #8