Nyanyan / Egaroucid

Super Strong and Fast Othello AI / Computer Othello Reversi
https://www.egaroucid.nyanyan.dev/
GNU General Public License v3.0
101 stars 11 forks source link

Optimize Generic Flip #254

Closed Nyanyan closed 3 months ago

Nyanyan commented 3 months ago

generic版のflipを高速化する

Nyanyan commented 3 months ago

before ffo40-49

Egaroucid_for_Console.exe -l 60 -nobook -thread 32 -hash 25 -solve problem/ffo40-49.txt 
|          Level|          Depth|           Move|          Score|           Time|          Nodes|            NPS|
|             60|        20@100%|             a2|            +38|  000:00:00.027|       13778717|      510322851|
|             60|        22@100%|             h4|             +0|  000:00:00.082|       30713919|      374559987|
|             60|        22@100%|             g2|             +6|  000:00:00.110|       44439509|      403995536|
|             60|        23@100%|             c7|            -12|  000:00:00.249|      121471782|      487838481|
|             60|        23@100%|             d2|            -14|  000:00:00.057|       15384519|      269903842|
|             60|        24@100%|             b2|             +6|  000:00:00.505|      309139835|      612158089|
|             60|        24@100%|             b3|             -8|  000:00:00.195|       81527384|      418089148|
|             60|        25@100%|             g2|             +4|  000:00:00.146|       75374684|      516264958|
|             60|        25@100%|             f6|            +28|  000:00:00.473|      223350840|      472200507|
|             60|        26@100%|             e1|            +16|  000:00:00.698|      370742498|      531149710|
total 1285923687 nodes in 2.542s NPS 505870844
Nyanyan commented 3 months ago

メモリ参照を使わないで書いてみたが、遅い

Egaroucid_for_Console.exe -l 60 -nobook -thread 32 -hash 25 -solve problem/ffo40-49.txt
|          Level|          Depth|           Move|          Score|           Time|          Nodes|            NPS|
|             60|        20@100%|             a2|            +38|  000:00:00.030|       13099607|      436653566|
|             60|        22@100%|             h4|             +0|  000:00:00.109|       32234340|      295727889|
|             60|        22@100%|             g2|             +6|  000:00:00.133|       41773840|      314089022|
|             60|        23@100%|             c7|            -12|  000:00:00.282|      107729664|      382020085|
|             60|        23@100%|             d2|            -14|  000:00:00.065|       15585394|      239775292|
|             60|        24@100%|             b2|             +6|  000:00:00.629|      307969036|      489616909|
|             60|        24@100%|             b3|             -8|  000:00:00.290|      114248198|      393959303|
|             60|        25@100%|             g2|             +4|  000:00:00.160|       59093201|      369332506|
|             60|        25@100%|             f6|            +28|  000:00:00.594|      223755799|      376693264|
|             60|        26@100%|             e1|            +16|  000:00:00.893|      382967888|      428855417|
total 1298456967 nodes in 3.185s NPS 407678796
Nyanyan commented 3 months ago

全体的に改善の余地はある(例えばgenetric版だとclzが遅いなど)ので、その辺を改善してみると良さそう

Nyanyan commented 3 months ago

clzを組み込みの_lzcnt_u64に置き換えると速くなるので、CLZが遅いので間違いなさそう

Egaroucid_for_Console.exe -l 60 -nobook -thread 32 -hash 25 -solve problem/ffo40-49.txt
|          Level|          Depth|           Move|          Score|           Time|          Nodes|            NPS|
|             60|        20@100%|             a2|            +38|  000:00:00.024|       13603762|      566823416|
|             60|        22@100%|             h4|             +0|  000:00:00.073|       29770152|      407810301|
|             60|        22@100%|             g2|             +6|  000:00:00.089|       47413373|      532734528|
|             60|        23@100%|             c7|            -12|  000:00:00.192|       95624186|      498042635|
|             60|        23@100%|             d2|            -14|  000:00:00.053|       16642411|      314007754|
|             60|        24@100%|             b2|             +6|  000:00:00.441|      306822738|      695743170|
|             60|        24@100%|             b3|             -8|  000:00:00.215|      115481065|      537121232|
|             60|        25@100%|             g2|             +4|  000:00:00.126|       68325712|      542267555|
|             60|        25@100%|             f6|            +28|  000:00:00.406|      206639141|      508963401|
|             60|        26@100%|             e1|            +16|  000:00:00.604|      350398833|      580130518|
total 1250721373 nodes in 2.223s NPS 562627698
Nyanyan commented 3 months ago

多分、以前の実装で良い気がするので戻す。

Nyanyan commented 3 months ago

ビット演算のみでflip計算はこれを使った

inline uint64_t calc_flip(const uint64_t player, const uint64_t opponent, const int place){
    uint64_t line, outflank;
    uint64_t mopponent = opponent & 0x7E7E7E7E7E7E7E7EULL;
    int rplace = HW2_M1 - place;
    pos = place;
    flip = 0;

    // downward
    line = 0x0080808080808080ULL >> rplace;
    outflank = (0x8000000000000000ULL >> clz(~opponent & line)) & player;
    flip |= ((~outflank + 1) * 2) & line;
    // upward
    line = 0x0101010101010100ULL << place;
    outflank = ((opponent | ~line) + 1) & line & player;
    flip |= (outflank - (outflank != 0)) & line;

    // right
    line = 0x7f00000000000000ULL >> rplace;
    outflank = (0x8000000000000000ULL >> clz(~mopponent & line)) & player;
    flip |= ((~outflank + 1) * 2) & line;
    // left
    line = 0x00000000000000feULL << place;
    outflank = ((mopponent | ~line) + 1) & line & player;
    flip |= (outflank - (outflank != 0)) & line;

    // d7 downward
    line = 0x0102040810204000ULL >> rplace;
    outflank = (0x8000000000000000ULL >> clz(~mopponent & line)) & player;
    flip |= ((~outflank + 1) * 2) & line;
    // d7 upward
    line = 0x0002040810204080ULL << place;
    outflank = ((mopponent | ~line) + 1) & line & player;
    flip |= (outflank - (outflank != 0)) & line;

    // d9 downward
    line = 0x0040201008040201ULL >> rplace;
    outflank = (0x8000000000000000ULL >> clz(~mopponent & line)) & player;
    flip |= ((~outflank + 1) * 2) & line;
    // d9 upward
    line = 0x8040201008040200ULL << place;
    outflank = ((mopponent | ~line) + 1) & line & player;
    flip |= (outflank - (outflank != 0)) & line;

    return flip;
}
Nyanyan commented 3 months ago

盤面180度回転を使えばCLZを使わずに済むのでは?

Nyanyan commented 3 months ago

そこそこ速いがまだ遅い

Egaroucid_for_Console.exe -l 60 -nobook -thread 32 -hash 25 -solve problem/ffo40-49.txt
|          Level|          Depth|           Move|          Score|           Time|          Nodes|            NPS|
|             60|        20@100%|             a2|            +38|  000:00:00.032|       13208387|      412762093|
|             60|        22@100%|             h4|             +0|  000:00:00.098|       30914292|      315451959|
|             60|        22@100%|             g2|             +6|  000:00:00.138|       48752638|      353279985|
|             60|        23@100%|             c7|            -12|  000:00:00.266|      107603818|      404525631|
|             60|        23@100%|             b8|            -14|  000:00:00.067|       16600570|      247769701|
|             60|        24@100%|             b2|             +6|  000:00:00.577|      309184035|      535847547|
|             60|        24@100%|             b3|             -8|  000:00:00.277|      112278606|      405337927|
|             60|        25@100%|             g2|             +4|  000:00:00.154|       63370847|      411499006|
|             60|        25@100%|             f6|            +28|  000:00:00.476|      185716723|      390161182|
|             60|        26@100%|             e1|            +16|  000:00:00.766|      351147122|      458416608|
total 1238777038 nodes in 2.851s NPS 434506151
Nyanyan commented 3 months ago

これがメモリ参照なしで最速なので、諦める

inline uint64_t calc_flip(uint64_t player, uint64_t opponent, const int place){
    uint64_t line, outflank;
    uint64_t mopponent = opponent & 0x7e7e7e7e7e7e7e7eULL;
    uint64_t rplayer = rotate_180(player);
    uint64_t ropponent = rotate_180(opponent);
    uint64_t mropponent = ropponent & 0x7e7e7e7e7e7e7e7eULL;
    int rplace = HW2_M1 - place;
    pos = place;
    flip = 0;
    uint64_t rflip = 0;

    // downward
    line = 0x0101010101010100ULL << rplace;
    outflank = ((ropponent | ~line) + 1) & line & rplayer;
    rflip |= (outflank - (outflank != 0)) & line;
    // upward
    line = 0x0101010101010100ULL << place;
    outflank = ((opponent | ~line) + 1) & line & player;
    flip |= (outflank - (outflank != 0)) & line;

    // right
    line = 0x00000000000000feULL << rplace;
    outflank = ((mropponent | ~line) + 1) & line & rplayer;
    rflip |= (outflank - (outflank != 0)) & line;
    // left
    line = 0x00000000000000feULL << place;
    outflank = ((mopponent | ~line) + 1) & line & player;
    flip |= (outflank - (outflank != 0)) & line;

    // d7 downward
    line = 0x0002040810204080ULL << rplace;
    outflank = ((mropponent | ~line) + 1) & line & rplayer;
    rflip |= (outflank - (outflank != 0)) & line;
    // d7 upward
    line = 0x0002040810204080ULL << place;
    outflank = ((mopponent | ~line) + 1) & line & player;
    flip |= (outflank - (outflank != 0)) & line;

    // d9 downward
    line = 0x8040201008040200ULL << rplace;
    outflank = ((mropponent | ~line) + 1) & line & rplayer;
    rflip |= (outflank - (outflank != 0)) & line;
    // d9 upward
    line = 0x8040201008040200ULL << place;
    outflank = ((mopponent | ~line) + 1) & line & player;
    flip |= (outflank - (outflank != 0)) & line;

    flip |= rotate_180(rflip);
    return flip;
}