leo-project / leofs

The LeoFS Storage System
https://leo-project.net/leofs/
Apache License 2.0
1.55k stars 155 forks source link

[leo_redundant_manager] racy accesses to ring/member tables #582

Open mocchira opened 7 years ago

mocchira commented 7 years ago

racy accesses

There are some racy accesses to ring/member tables like the below

that may cause https://github.com/leo-project/leofs/issues/572 or something similar ones.

Not only just fixing those cases but to prevent of recurrence of racy issues as possible I'd like to make the one data access layer that wraps all operations to ring/member tables and other modules have to access ring/member ones only through this layer that makes it easy for maintainers to understand how things goes on also grantee the correctness with minimum effort.

a few NOT atomic functions in leo_clustertbl(member|ring)

like

just makes it atomic.

transaction is not enough?

also now we are using transaction context when interacting with mnesia but sync_transaction is more reliable choice since transaction doesn't grantee some causality of distributed features like spawning a process on some remote node and so on. so doing always right jobs may be difficult for maintainers while taking care such premises. so also I'd like to change all transaction contexts into sync_transaction.

TODO

mocchira commented 7 years ago
mocchira commented 7 years ago

Generic Concept

Rollback Policy

TODO

The list of new I/F

To be (added|modified) on the existing leo_redundant_manager.

%% fix the current impl on chash to be atomic
-spec(range_of_vnodes(TableInfo, VNodeId) ->
             {ok, [tuple()]} | {error, any()} when TableInfo::table_info(),
                                  VNodeId::integer()).

%% fix the current impl around leo_redundant_manager(_api) to be atomic
-spec(rebalance() ->
             {ok, [tuple()]} | {error, any()}).

%% used by worker to sync member,ring information on gen_server with the source(mnesia|ets)
-spec(sync_worker(State) ->
             {ok, #state{}} | {error, any()} when State::#state{}).

%% used by api to sync source(mnesia|ets) with SyncData including the current, prev members
-spec(sync_source(SyncTarget, SyncData, Options) ->
             {ok, [{atom(), any()}]} |
             {error, any()} when SyncTarget::sync_target(),
                                 SyncData::[{atom(), any()}],
                                 Options::[{atom(), any()}]).

%% update the both(cur, prev) of ring tables based on the member table
-spec(create() -> ok).

%% https://github.com/leo-project/leo_redundant_manager/blob/1.9.31/src/leo_redundant_manager.erl#L648-L665
%% fix to be atomic
-spec(attach(Node, AwarenessL2, Clock, NumOfVNodes) ->
             ok | {error, any()} when Node::atom(),
                                      AwarenessL2::string(),
                                      Clock::integer(),
                                      NumOfVNodes::integer()).
-spec(attach(Node, AwarenessL2, Clock, NumOfVNodes, RPCPort) ->
             ok | {error, any()} when Node::atom(),
                                      AwarenessL2::string(),
                                      Clock::integer(),
                                      NumOfVNodes::integer(),
                                      RPCPort::integer()).
-spec(attach(TableInfo, Node, AwarenessL2, Clock, NumOfVNodes, RPCPort) ->
             ok | {error, any()} when TableInfo::ring_table_info(),
                                      Node::atom(),
                                      AwarenessL2::string(),
                                      Clock::integer(),
                                      NumOfVNodes::integer(),
                                      RPCPort::integer()).

%% https://github.com/leo-project/leo_redundant_manager/blob/1.9.31/src/leo_redundant_manager.erl#L670-L682
%% fix to be atomic
-spec(detach(Node, Clock) ->
             ok | {error, any()} when Node::atom(),
                                      Clock::integer()).
-spec(detach(TableInfo, Node, Clock) ->
             ok | {error, any()} when TableInfo::ring_table_info(),
                                      Node::atom(),
                                      Clock::integer()).

%% fix the current impl around leo_redundant_manager_(api|chash) to be atomic
%% when retrieving rings to retrieve both at once(to be) from retrieve each one by one(as is)
-spec(checksum(Type) ->
             {ok, integer()} |
             {ok, {integer(), integer()}} |
             {error, any()} when Type::checksum_type()).

%% fix the current impl around leo_redundant_manager(_api) to be atomic
%% when retrieving members to retrieve both at once(to be) from retrieve each one by one(as is)
-spec(get_all_ver_members() ->
             {ok, {CurMembers, PrevMembers}} |
             {error, Cause} when CurMembers::[#member{}],
                                 PrevMembers::[#member{}],
                                 Cause::any()).

%% fix the current impl around leo_redundant_manager_(api|worker) to be atomic
%% sync both at once(to be) from sync each one by one(as is)
-spec(force_sync_workers() ->
             ok).

%% fix the current impl around leo_redundant_manager(_api) to be atomic
-spec(update_members(Members) ->
             ok | {error, any()} when Members::[#member{}]).

%% fix the current impl around leo_redundant_manager(_api) to be atomic
-spec(update_member_by_node() -> ok).

%% fix the current impl around leo_redundant_manager_api to be atomic
-spec(get_alias(Node, GrpL2) ->
             {ok, tuple()} when Node::atom(),
                                GrpL2::string()).
-spec(get_alias(Table, Node, GrpL2) ->
             {ok, tuple()} when Table::member_table(),
                                Node::atom(),
                                GrpL2::string()).

%% no problem
%% -spec(reserve() -> ok).
%% -spec(suspend() -> ok).
%% -spec(dump() -> ok).
%% -spec(has_member() -> ok).
%% -spec(get_members() -> ok).
%% -spec(get_member_by_node() -> ok).
%% -spec(get_member_by_status() -> ok).
%% -spec(update_member() -> ok).
%% -spec(delete_member_by_node() -> ok).
mocchira commented 7 years ago

@yosukehara I've finished the first version at https://github.com/leo-project/leofs/issues/582#issuecomment-274725931 . Please review.

yosukehara commented 7 years ago

@mocchira I've reviewed https://github.com/leo-project/leofs/issues/582#issuecomment-274725931 . There is no issue, and I'm with you. Do you think you implement that?

mocchira commented 7 years ago

@yosukehara thanks for reviewing. I'll go forward based on this design.

mocchira commented 7 years ago

@yosukehara As I've updated the Rollback Policy section at https://github.com/leo-project/leofs/issues/582#issuecomment-274725931 in order to make it easy to implement rollback on memory information(ets, state on gen_server), PTAL.