Open Apoorv7092 opened 11 months ago
Hello,
Can anyone help me with how to run the core42/jais-13b-chat model with CTranslate2? I ran the conversion script but ran into an error.

Script used:

```
ct2-transformers-converter --model core42/jais-13b-chat --quantization bfloat16 --output_dir jais-13b-ct2 --trust_remote_code
```

Error:

```
ValueError: No conversion is registered for the model configuration JAISConfig (supported configurations are: BartConfig, BertConfig, BloomConfig, CodeGenConfig, DistilBertConfig, FalconConfig, GPT2Config, GPTBigCodeConfig, GPTJConfig, GPTNeoXConfig, LlamaConfig, M2M100Config, MBartConfig, MPTConfig, MT5Config, MarianConfig, MixFormerSequentialConfig, OPTConfig, PegasusConfig, RWConfig, T5Config, WhisperConfig, XLMRobertaConfig)
```
@vince62s It would be a really big help if you could look into this or at least guide me.
As the message says, it is not in the supported models list. Maybe you can try to see how close it is to another model, and then the loader can be adapted.
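For example, one quick way to gauge how close JAIS is to an already-supported model is to diff its config against a candidate. A minimal sketch; picking GPT-2 as the comparison point is an assumption, not a confirmed match:

```python
# Sketch: diff the JAIS config against GPT-2's to see which fields differ.
from transformers import AutoConfig

jais = AutoConfig.from_pretrained("core42/jais-13b-chat", trust_remote_code=True)
gpt2 = AutoConfig.from_pretrained("gpt2")

jais_d, gpt2_d = jais.to_dict(), gpt2.to_dict()
for key in sorted(set(jais_d) | set(gpt2_d)):
    if jais_d.get(key) != gpt2_d.get(key):
        print(f"{key}: jais={jais_d.get(key)!r} gpt2={gpt2_d.get(key)!r}")
```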
@vince62s Thanks for the quick reply!!
I did manage to write an adapter for JAIS (code below) and converted the model, but while running inference via CTranslate2 I ran into a cublas_not_supported error (which basically points to a faulty matmul between the layers, hence a faulty conversion). Here is the adapter class code:
```python
# Note: this class is meant to be added to CTranslate2's
# ctranslate2/converters/transformers.py, which already provides
# register_loader, ModelLoader, transformer_spec, common_spec, utils,
# _SUPPORTED_ROPE_SCALING, and gc.
@register_loader("JAISConfig")
class JaisLoader(ModelLoader):
    @property
    def architecture_name(self):
        return "AutoModelForCausalLM"

    def get_model_spec(self, model):
        num_layers = model.config.num_hidden_layers
        num_heads = model.config.num_attention_heads
        num_heads_kv = getattr(model.config, "num_key_value_heads", num_heads)
        if num_heads_kv == num_heads:
            num_heads_kv = None

        rope_scaling = getattr(model.config, "rope_scaling", None)
        if rope_scaling:
            rotary_scaling_type = _SUPPORTED_ROPE_SCALING.get(rope_scaling["type"])
            rotary_scaling_factor = rope_scaling["factor"]

            if rotary_scaling_type is None:
                raise NotImplementedError(
                    "RoPE scaling type '%s' is not yet implemented. "
                    "The following RoPE scaling types are currently supported: %s"
                    % (rope_scaling["type"], ", ".join(_SUPPORTED_ROPE_SCALING.keys()))
                )
        else:
            rotary_scaling_type = None
            rotary_scaling_factor = 1

        spec = transformer_spec.TransformerDecoderModelSpec.from_config(
            num_layers,
            num_heads,
            activation=common_spec.Activation.SWISH,
            pre_norm=True,
            # ffn_glu=True,
            # rms_norm=True,
            alibi=True,
            alibi_use_positive_positions=True,
            scale_alibi=True,
            # rotary_dim=0,
            # rotary_interleave=False,
            # rotary_scaling_type=rotary_scaling_type,
            # rotary_scaling_factor=rotary_scaling_factor,
            # rotary_base=getattr(model.config, "rope_theta", 10000),
            num_heads_kv=num_heads_kv,
        )

        self.set_decoder(spec.decoder, model.transformer)
        self.set_linear(spec.decoder.projection, model.lm_head)
        return spec

    def get_vocabulary(self, model, tokenizer):
        tokens = super().get_vocabulary(model, tokenizer)
        # Pad the vocabulary up to the model's declared vocab_size.
        extra_ids = model.config.vocab_size - len(tokens)
        for i in range(extra_ids):
            tokens.append("<extra_id_%d>" % i)
        return tokens

    def set_vocabulary(self, spec, tokens):
        spec.register_vocabulary(tokens)

    def set_config(self, config, model, tokenizer):
        config.bos_token = tokenizer.bos_token
        config.eos_token = tokenizer.eos_token
        config.unk_token = tokenizer.unk_token
        config.layer_norm_epsilon = model.config.layer_norm_epsilon

    def set_layer_norm(self, spec, layer_norm):
        spec.gamma = layer_norm.weight
        spec.beta = layer_norm.bias

    def set_position_encodings(self, spec, module):
        spec.encodings = module.slopes
        offset = getattr(module, "offset", 0)
        if offset > 0:
            spec.encodings = spec.encodings[offset:]

    def set_decoder(self, spec, module):
        spec.scale_embeddings = False
        self.set_embeddings(spec.embeddings, module.wte)

        for layer_spec, layer in zip(spec.layer, module.h):
            self.set_layer_norm(layer_spec.self_attention.layer_norm, layer.ln_1)
            self.set_linear(layer_spec.self_attention.linear[0], layer.attn.c_attn)
            self.set_linear(layer_spec.self_attention.linear[1], layer.attn.c_proj)

            self.set_layer_norm(layer_spec.ffn.layer_norm, layer.ln_2)
            # Fuse the two SwiGLU input projections into a single linear.
            split_layers = [common_spec.LinearSpec() for _ in range(2)]
            self.set_linear(split_layers[0], layer.mlp.c_fc)
            self.set_linear(split_layers[1], layer.mlp.c_fc2)
            utils.fuse_linear(layer_spec.ffn.linear_0, split_layers)
            self.set_linear(layer_spec.ffn.linear_1, layer.mlp.c_proj)

            # Free the original modules as we go to limit peak memory.
            delattr(layer, "attn")
            delattr(layer, "mlp")
            gc.collect()

        self.set_layer_norm(spec.layer_norm, module.ln_f)
```
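For reference, the error shows up with an inference call along these lines (a minimal sketch; the output directory and prompt are assumptions):

```python
# Sketch of the generation call that triggers the cublas error after conversion.
import ctranslate2
import transformers

tokenizer = transformers.AutoTokenizer.from_pretrained(
    "core42/jais-13b-chat", trust_remote_code=True
)
generator = ctranslate2.Generator("jais-13b-ct2")

# CTranslate2 generators take token strings, not ids.
start_tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode("Hello"))
results = generator.generate_batch([start_tokens], max_length=16)
print(tokenizer.decode(results[0].sequences_ids[0]))
```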
Here is the actual model architecture for reference:
```
JAISModel(
  (wte): Embedding(84992, 5120)
  (drop): Dropout(p=0.0, inplace=False)
  (h): ModuleList(
    (0-39): 40 x JAISBlock(
      (ln_1): LayerNorm((5120,), eps=1e-05, elementwise_affine=True)
      (attn): JAISAttention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_dropout): Dropout(p=0.0, inplace=False)
      )
      (ln_2): LayerNorm((5120,), eps=1e-05, elementwise_affine=True)
      (mlp): JAISMLP(
        (c_fc): Conv1D()
        (c_fc2): Conv1D()
        (c_proj): Conv1D()
        (act): SwiGLUActivation()
        (dropout): Dropout(p=0.0, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((5120,), eps=1e-05, elementwise_affine=True)
  (relative_pe): AlibiPositionEmbeddingLayer()
)
```
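One detail this printout hides is the weight layout of the Conv1D modules; it can be inspected directly (a quick sketch, for illustration only):

```python
# Sketch: print the raw weight shapes of the Conv1D modules in the first block.
import transformers

model = transformers.AutoModelForCausalLM.from_pretrained(
    "core42/jais-13b-chat", trust_remote_code=True
)
block = model.transformer.h[0]
print(block.attn.c_attn.weight.shape)  # HF-style Conv1D stores (in_features, out_features)
print(block.mlp.c_fc.weight.shape)
```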
Please do find the time (if you can) to review this and let me know if there are any flaws. Thanks in advance!
As you can see in the second printout, you have convolution layers, so you cannot just copy-paste the loader from another regular GPT-like model. Have a look at the Whisper loader, but this is not as straightforward as it may look.
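The likely sticking point: HF-style Conv1D stores its weight as (in_features, out_features), i.e. the transpose of nn.Linear, so copying it straight into a LinearSpec produces wrong matmul shapes. If I recall correctly, the base ModelLoader.set_linear already transposes transformers.Conv1D weights, but the JAIS Conv1D class comes from remote code, so an isinstance check against transformers' class would not match it; that detail is an assumption worth verifying. A minimal sketch of an override for the loader above:

```python
# Sketch, not the confirmed fix: transpose Conv1D weights when copying them
# into a LinearSpec. Matching by class name rather than isinstance is an
# assumption made because JAIS's Conv1D is defined in the model's remote code.
def set_linear(self, spec, module):
    weight = module.weight
    if type(module).__name__ == "Conv1D":
        weight = weight.transpose(0, 1)  # (in, out) -> (out, in), as nn.Linear
    spec.weight = weight
    bias = getattr(module, "bias", None)
    if bias is not None:
        spec.bias = bias
```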