imneme / pcg-cpp

PCG — C++ Implementation
Apache License 2.0
735 stars 98 forks source link

Questions regarding internal state of PCG rng #80

Open Darendis opened 1 year ago

Darendis commented 1 year ago

I have a situation where I need to seed PCG from a random_device, perform some work and if later on if an investigation is needed to be able to generate the same randomness used for the given unit of work.

The following demo code does that and verifies the numbers generated are identical once the rng is reseeded - all that is good or at least seems "ok".

What I'm sort of confused with is the following:

  1. The size, in terms of bytes, of the internal seed changes on every iteration (ranges from 115 to 118 bytes)
  2. The first 39 bytes of the internal state are always the same regardless of iteration or if the process is rerun
i: 003157 state_size: 118 state: 0x34373032363234373638373934323132313834383134343230373439313833373532333532352031363633363437363738363637353336363735323933393837333638303333343331333134323120323333373034333838313634363535373131373135383634353733373431343636353934323032
i: 003158 state_size: 118 state: 0x34373032363234373638373934323132313834383134343230373439313833373532333532352031333030333935393939303235363433383336353032303136363737393735363637323235353920313230343430393334323834373531383136383133303336363435383435353632373833393137
i: 003159 state_size: 117 state: 0x343730323632343736383739343231323138343831343432303734393138333735323335323520363531363934333530333130383735333338303632333836383631373832383139313137393920323431303032363031393131313439313637363732303838353935373731303438343833313838
i: 003160 state_size: 117 state: 0x343730323632343736383739343231323138343831343432303734393138333735323335323520393935313338343230363636323432303630343532303532383139333536323536323231383720323332363033393736333335343535383934393736303732343437363538373339353737373333
i: 003161 state_size: 117 state: 0x343730323632343736383739343231323138343831343432303734393138333735323335323520333731363737353233333733363336363131373535313737323330363435323136303432313120333035303036363937303631363032353336303932373132363137353232333335343535383331
i: 003162 state_size: 118 state: 0x34373032363234373638373934323132313834383134343230373439313833373532333532352032323733313635303232313036373235353833383235313036353236393632363539353630393320313136373832343930373834353031343031343535303536383637313633393931383935323338
i: 003163 state_size: 117 state: 0x343730323632343736383739343231323138343831343432303734393138333735323335323520333236323138353338333539303336313134393633393134343832353439313932333534353631203838353632313331373737373230383139393639373631333233323433303635393832303738
i: 003164 state_size: 117 state: 0x343730323632343736383739343231323138343831343432303734393138333735323335323520313332383339363632303030323939323639383130343330323833363831303337313935333120313534343139303531303332323838343234343635333831353735383939353335363236303436
i: 003165 state_size: 117 state: 0x343730323632343736383739343231323138343831343432303734393138333735323335323520313034323438353737333232323436373138343037343833333634383936363533313639313339203337303138343536333937303235313736383835323038363133373433363832363238393034
i: 003166 state_size: 117 state: 0x343730323632343736383739343231323138343831343432303734393138333735323335323520323534303331333630383639343437303636383330353235323532373135383730333535393733203735373231313839343632343835313233383831363237303831383235363636373932313730
i: 003167 state_size: 117 state: 0x343730323632343736383739343231323138343831343432303734393138333735323335323520383231353738303835313334383637383139313637373731373235313635313139303430393320333334313230303333393734303932363836373031343330313939383130373733323739393530
i: 003168 state_size: 117 state: 0x343730323632343736383739343231323138343831343432303734393138333735323335323520343533333634333931343333343336313631313030333432393031323234363037373133343720323136333430323937393735353037323839323433323138333839353230323739373737343439
i: 003169 state_size: 118 state: 0x34373032363234373638373934323132313834383134343230373439313833373532333532352031343133323930303232343239363239393833373739303432393233363437323232373939303120323132393632303531323938303932303131373732303330303834323832313032363432343837
i: 003170 state_size: 118 state: 0x34373032363234373638373934323132313834383134343230373439313833373532333532352032343237323835353830363434383033363832343732323638323731393930333637343134393720313533363436393633333436373636343930383430363736383331313430343639353135393631
i: 003171 state_size: 117 state: 0x343730323632343736383739343231323138343831343432303734393138333735323335323520363834393934373638343332373537323537363739363839303336383233373135393239363120313330373838363933323238313439333338373834393838333739333938343833313934343532
i: 003172 state_size: 118 state: 0x34373032363234373638373934323132313834383134343230373439313833373532333532352031333733373636333631303336343634333437343832353131343038303531303238323535343520313334303531353339313533353539373536323933393433373634333435323332343831373237
i: 003173 state_size: 118 state: 0x34373032363234373638373934323132313834383134343230373439313833373532333532352032383235303038313431313632353637383236343135343031393631383139333331373739373920323834373539323533303533383235383134373939363737333235333033323036353030363830
i: 003174 state_size: 117 state: 0x343730323632343736383739343231323138343831343432303734393138333735323335323520323937373332373030313031383934343031353237323634313330393436393639373936303031203538383233313938323630383430323839323733363135343731323234393931373337303630
i: 003175 state_size: 117 state: 0x343730323632343736383739343231323138343831343432303734393138333735323335323520343039393832343930323034393138303630333633363032393534393732313833353833323120323231353830343932383634313233333330313437313537343935363634373935373639303934
i: 003176 state_size: 117 state: 0x343730323632343736383739343231323138343831343432303734393138333735323335323520353133343735353137353432343537343539333531323030363438383930393139323737303720333037303735353938373035313532343632313130343238333330333535383534363135373034
i: 003177 state_size: 116 state: 0x3437303236323437363837393432313231383438313434323037343931383337353233353235203231383132363332383330303839313736333230303639363033343033323632303631333230392037393333343434393335323837393638373636303731353539363633383830333939353831
i: 003178 state_size: 118 state: 0x34373032363234373638373934323132313834383134343230373439313833373532333532352032363133313632313631333532343338303834383632393638373839313039313130353937343320313532313031343530303337373235383132393035303031393935353531303433313737383333
i: 003179 state_size: 117 state: 0x343730323632343736383739343231323138343831343432303734393138333735323335323520383133393433393235303137303932313434373030353433363837303737373630383136393320313431353536393531303335383533383736333133323238343835323239323333383635333337

Are the above two situations normal or expected behavior?


Example code:

#include <cstdio>
#include <sstream>
#include <string>
#include <random>

#include <pcg/pcg_random.hpp>

std::string str2hex(const std::string& input)
{
    static const char hex_digits[] = "0123456789ABCDEF";
    std::string output;
    output.reserve(input.length() * 2);
    for (const unsigned char c : input)
    {
        output.push_back(hex_digits[c >> 4]);
        output.push_back(hex_digits[c & 15]);
    }
    return output;
}

int main()
{
    pcg_extras::seed_seq_from<std::random_device> seed_source;

    constexpr auto max_num_samples = 10000000;

    for (std::size_t i = 0; i < 1000000000; ++i)
    {
        pcg64 rng(seed_source);

        std::string internal;
        std::stringstream strm;
        strm << rng;
        rng_internal = strm.str();

        printf("i: %06ld state_size: %ld state: 0x%s\n", i, internal.size(), str2hex(internal).c_str());
    }

    return 0;
}
adam4130 commented 1 year ago

This is expected. The implementation of << contains the following (include/pcg_random.hpp:592). Most PCG configurations don't modify the multiplier or increment, so the first part of the output will always be the same.

out << rng.multiplier() << space
    << rng.increment() << space
    << rng.state_;
tbxfreeware commented 1 year ago

O'Neill's implementations of operator>> and operator<< allow one to read and write states between incompatible PCG engines. So long as their multipliers match, for instance, you can write the state from a pcg32_fast engine, and read it back into a pcg32 engine.

This is problematic for several reasons that are listed below, in comments taken from my "annotated" version of pcg_random.hpp.

In the case mentioned above, it is also buggy.

A pcg32_fast engine writes 0 as its increment. This can be read by a pcg32 engine because it allows a variable increment, and does not check the new_increment it reads (even though it should!). The pcg32 engine then right-shifts the new_increment to convert it into a stream, and calls function set_stream to install it. The exact call is rng.set_stream(new_increment >> 1); When the new_increment is 0, however, new_increment >> 1 is also 0, so this call is the same as rng.set_stream(0);.

The final step is to install the new stream by left-shifting, and or-ing in the required 1. Here is the code:

    void set_stream(itype specific_seq)
    {
         inc_ = (specific_seq << 1) | 1;
    }

This has the effect of setting inc_ to 1.

Whoops! We read the increment as 0, and we installed the increment as 1! That's a bug.

The fix is to require, for specific_stream only, that (new_increment & 1u) == 1u. (For other stream_mixins, new_increment must match the existing increment.) A better fix would be to disallow reading and writing between dissimilar PCG engines.

I have written stream I/O functions that correct these problems, and a few others as well. If you would like to see them, let me know.

Here are my notes from the annotated file:

//======================================================================
// Stream Insertion and Extraction Operators
//======================================================================
// Notes by tbxfreeware:
// For the purposes I/O stream insertion and extraction, the "state" of 
// a PCG engine is comprised of three elements:
// 
//   1. multiplier()
//   2. increment()
//   3. state_
// 
// For a given PCG engine, operator<< writes all three to the output 
// stream.
// 
// For a completely different engine, constructed perhaps with a 
// different xtype, itype, output_mixin, output_previous, 
// multiplier_mixin, and/or stream_mixin, operator>> can successfully 
// input the "state" under these conditions:
// 
//   1. Its multiplier() member function returns the same value as the 
//      value read from the input stream.
// 
//   2. Either of two things is true:
//      a. Its stream_mixin is specific_stream, or
//      b. Its increment() member function returns the same value as the 
//         value read from the input stream.
// 
// There are problems with this approach. 
// 
// O'Neill defines stream I/O operators as though their job were the 
// input and output of LCG state. It is not. Their job is to input and 
// output the state of a PCG engine. PCG state is not the same thing 
// as LCG state. 
// 
// Suppose we are given two PCG engines, e1 and e2. When the state of 
// engine e1 is written to an I/O stream, and is then read by engine e2, 
// the two engines should be placed into the same state. They should 
// compare as equal, and they should produce the same sequence of 
// random variates when operater() is called. If these things cannot 
// be made to happen, then the read operation should fail.
// 
// What I'm saying here is that xtype, itype, output_mixin, and 
// output_previous, at least, are part of the state of PCG engine. 
// Matching multiplier() values is also part of the state, 
// and unless the stream_mixin is specific_stream, so is the need 
// to have matching increment() values.
//
// As for xtype and itype, we already know that they are unsigned 
// integral types, so it is enough to record their sizes as part of 
// the state. 
// 
// output_mixin and stream_mixin could have "id" or "tag" fields added 
// to their structs to identify them. Type int would do the job; 
// an enumeration constant, more elegantly.
// 
// Suppose these "global" enumeration classes were defined in namespace pcg: 
// 
// namespace pcg
// {
//     enum class stream_model : int
//     {
//          specific, oneseq, unique, no_stream
//     };
//     enum class output_model : int
//     {
//          xsh_rs, xsh_rr, rxs, rxs_m_xs, rxs_m, dxsm, xsl_rr, xsl_rr_rr, xsh, xsl
//     };
// }
// 
// Each mixin could then define a function that returned an identifying  
// enumeration constant, for example:
// 
// class no_stream
// {
//     auto static stream_model { return pcg::stream_model::no_stream; }
//     ...
// }
// class xsh_rs_mixin
// {
//     auto static output_model { return pcg::output_model::xsh_rs; }
//     ...
// }
// 
// You could not directly read or write the enumeration constants, but 
// they could easily be cast to type int for purposes of stream I/O 
// operations when reading or writing the state.
// 
// Finally, the boolean output_previous could also be converted to int 
// when reading or writing state.
// 
// In conclusion, there are eight fields that need to be written to 
// record the state of a PCG engine:
// 
//  1. sizeof(xtype)
//  2. sizeof(itype)
//  3. static_cast<int>(stream_mixin::stream_model())
//  4. static_cast<int>(output_mixin::output_model())
//  5. static_cast<int>(output_previous)
//  6. +multiplier()     // unary plus forces integral promotion
//  7. +increment()      // obviating the need for custom operator<<
//  8. +state_
// 
// When the state is input, item 6-8 should be read into temporary 
// variables of type uintmax_t or pcg128_t, thus obviating the 
// need for a custom operator>>.
// 
// Successful input should require matching items 1-6 exactly, 
// and sometimes item 7 (for all except stream_model::specific).
// 
// For stream_model::specific, require (increment & 1u) == 1u,
// thus trapping the bug identified above.
// 
// For stream_model::no_stream, fail when the two low-order bits 
// of item 8 are not both 1, i.e., require (state & 3u) == 3u.
//======================================================================
tbxfreeware commented 1 year ago

Here is a short program that verifies the bug described above.

It is a complete program, so you should be able to copy, paste, and compile without any need to fiddle. I had my compiler set to C++14.

#include <cassert>
#include <iostream>
#include <sstream>
#include <type_traits>
#include "pcg_random.hpp"

template< typename charT, typename traits >
bool check_whether_mcg_state_can_be_read_by_pcg32( 
    std::basic_ostream<charT, traits>& log)
{
    using state_type = typename pcg32::state_type;
    enum : state_type { zero, one };

    pcg32 e;
    pcg32_fast e_fast;

    std::stringstream sst;
    sst << e_fast;
    sst >> e;
    auto const pass1{ !sst.fail() };
    assert(pass1);

    sst.clear();
    sst.str("");
    sst << e;
    state_type multiplier{}, increment{}, state{};
    sst >> multiplier >> increment >> state;
    auto const pass2{ increment == one };
    assert(pass2);

    log << "Can the state of pcg32_fast be read in by pcg32?"
        "\nAnswer: " << (pass1 ? "yes\n\n" : "no\n\n");
    if (pass1)
    {
        log << "Does that change the increment from 0 on pcg32_fast "
            "to 1 on pcg32?"
            "\nAnswer: " << (pass2 ? "yes\n\n" : "no\n\n");
    }
    return pass1 && pass2;
}
int main()
{
    auto& log{ std::cout };
    return check_whether_mcg_state_can_be_read_by_pcg32(log) ? 0 : 1;
}
// end file: main.cpp
Darendis commented 10 months ago

thank-you @tbxfreeware, that was a very informative writeup.