zepingyu0512 / neuron-attribution

code for EMNLP 2024 paper: Neuron-Level Knowledge Attribution in Large Language Models

Should the attention consider the attention group number? #2

Open sev777 opened 3 weeks ago

sev777 commented 3 weeks ago

```python
# find query FFN neurons activating attn neurons
curfile_ffn_score_dict = {}
for l_h_n_p, increase_score in cur_file_attn_neuron_list_sort[:30]:
    attn_layer, attn_head, attn_neuron, attn_pos = l_h_n_p.split("_")
    attn_layer, attn_head, attn_neuron, attn_pos = int(attn_layer), int(attn_head), int(attn_neuron), int(attn_pos)
    cur_attn_neuron = attn_head*HEAD_DIM + attn_neuron
    attn_neuron_key = model.model.layers[attn_layer].self_attn.v_proj.weight.data[cur_attn_neuron]
    attn_neuron_key_new = attn_neuron_key * model.model.layers[attn_layer].input_layernorm.weight.data
    cur_inner_all = torch.sum(torch.tensor(all_pos_layer_input[attn_layer][attn_pos]) * attn_neuron_key_new, -1)
    for layer_i in range(attn_layer):
        cur_layer_neurons = (torch.tensor(all_pos_coefficient_scores[layer_i][attn_pos]) * get_fc2_params(model, layer_i)).T
        cur_layer_neurons_innerproduct = torch.sum(cur_layer_neurons * attn_neuron_key_new, -1) / cur_inner_all
        for neuron_i in range(len(cur_layer_neurons_innerproduct)):
            if str(layer_i)+"_"+str(neuron_i) not in curfile_ffn_score_dict:
                curfile_ffn_score_dict[str(layer_i)+"_"+str(neuron_i)] = 0.0
            curfile_ffn_score_dict[str(layer_i)+"_"+str(neuron_i)] += cur_layer_neurons_innerproduct[neuron_i].item() * increase_score
```

```python
cur_attn_neuron = attn_head*HEAD_DIM + attn_neuron
attn_neuron_key = model.model.layers[attn_layer].self_attn.v_proj.weight.data[cur_attn_neuron]
```

For Llama3, model.model.layers[attn_layer].self_attn.v_proj.weight.data.shape is [1024, 4096], because num_key_value_groups is 4 (i.e. there are only 8 key/value heads). So should `cur_attn_neuron = attn_head*HEAD_DIM+attn_neuron` instead be `cur_attn_neuron = attn_head%8*HEAD_DIM+attn_neuron`?
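
For reference, a minimal sketch of the shape arithmetic behind this question, assuming the Llama-3-8B config values (hidden_size 4096, 32 attention heads, 8 key/value heads, head_dim 128); these numbers are assumptions and are not taken from this repo's code:

```python
# Sketch of the GQA shape arithmetic; the config values are assumed Llama-3-8B values.
HIDDEN_SIZE = 4096
NUM_ATTN_HEADS = 32                               # query heads
NUM_KV_HEADS = 8                                  # key/value heads (grouped-query attention)
HEAD_DIM = HIDDEN_SIZE // NUM_ATTN_HEADS          # 128
NUM_KV_GROUPS = NUM_ATTN_HEADS // NUM_KV_HEADS    # 4 query heads share one kv head

# v_proj maps hidden_size -> NUM_KV_HEADS * HEAD_DIM, so its weight is [1024, 4096].
print(NUM_KV_HEADS * HEAD_DIM, HIDDEN_SIZE)       # 1024 4096

# Indexing v_proj.weight.data with attn_head * HEAD_DIM + attn_neuron is therefore
# only valid for attn_head < NUM_KV_HEADS; for heads 8..31 the row index overflows.
max_row = (NUM_ATTN_HEADS - 1) * HEAD_DIM + (HEAD_DIM - 1)
print(max_row)                                    # 4095, but v_proj only has 1024 rows
```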

zepingyu0512 commented 5 days ago

Yes, Llama and Llama2 don't need to consider the group number, because the output dimension of QKVO is the same as the input dimension.

I also ran into this problem with Llama3 today. I think it should be:

```python
cur_attn_neuron = attn_head//4*HEAD_DIM + attn_neuron
```

The source code is the repeat_kv function here: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py

And after repeat_kv, each group of 4 consecutive query heads shares the same key/value parameters, so the first 4 heads all read kv head 0.
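
A quick way to check this mapping is to tag each key/value head with its own index and push it through the repeat_kv logic. The sketch below re-implements repeat_kv with (to my understanding) the same semantics as the linked modeling_llama.py version, and uses assumed Llama-3-8B head counts (8 kv heads, group size 4, head_dim 128):

```python
import torch

# Minimal re-implementation of repeat_kv, intended to mirror the semantics of the
# version in modeling_llama.py; used here only to check the head-index mapping.
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    batch, num_kv_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_kv_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_kv_heads * n_rep, slen, head_dim)

NUM_KV_HEADS, N_REP, HEAD_DIM = 8, 4, 128         # assumed Llama-3-8B values

# Fill each kv head with its own index so we can see where it lands after repetition.
tagged = torch.arange(NUM_KV_HEADS, dtype=torch.float).view(1, NUM_KV_HEADS, 1, 1)
tagged = tagged.expand(1, NUM_KV_HEADS, 1, HEAD_DIM)
repeated = repeat_kv(tagged, N_REP)               # shape [1, 32, 1, 128]

for attn_head in range(NUM_KV_HEADS * N_REP):
    kv_head = int(repeated[0, attn_head, 0, 0])
    assert kv_head == attn_head // N_REP          # heads 0-3 -> kv 0, heads 4-7 -> kv 1, ...

# So the v_proj row for (attn_head, attn_neuron) is (attn_head // N_REP) * HEAD_DIM + attn_neuron,
# which matches cur_attn_neuron = attn_head//4*HEAD_DIM + attn_neuron above.
```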