szemyd opened 2 years ago
Something like this:
```python
#%%
from transformers import pipeline, AutoModel, AutoTokenizer
from explore_init import ce_init
from utils import get_logger

#%%
logger = get_logger()
ce, df = ce_init()
# toy code snippet to embed
code = 'def hello_s(s: str):\n print(f"Hello s")'

#%%
# load the CodeBERT tokenizer/model and build a feature-extraction pipeline
tokenizer = AutoTokenizer.from_pretrained(
    "microsoft/codebert-base", truncate=True, max_length=3
)
model = AutoModel.from_pretrained("microsoft/codebert-base")
p = pipeline("feature-extraction", model=model, tokenizer=tokenizer)

# %%
# https://github.com/microsoft/CodeBERT
# tokenize the snippet, wrap it with separator tokens, and convert to ids
code_tokens = tokenizer.tokenize(code, truncation=True, max_length=512)
tokens = [tokenizer.sep_token] + code_tokens + [tokenizer.sep_token]
tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
tokens_ids

# %%
import torch

# per-token contextual embeddings, shape (1, len(tokens), hidden_size)
context_embeddings = model(torch.tensor(tokens_ids)[None, :])[0]
context_embeddings
```
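For reference, a possible follow-up cell (not part of the snippet above, just a sketch): the feature-extraction pipeline `p` is built but never called, and `context_embeddings` is one vector per token, so something like the below would exercise the pipeline and mean-pool the token embeddings into a single fixed-size vector per snippet. The mean pooling and the `code_vector` / `pipeline_features` names are my own choices here, not anything CodeBERT prescribes.

```python
# %%
# Sketch only: mean pooling and the names below are illustrative assumptions.
import torch

# The feature-extraction pipeline returns nested lists of per-token
# embeddings for the input string, roughly (1, num_tokens, hidden_size).
pipeline_features = torch.tensor(p(code))
pipeline_features.shape

# Mean-pool the manually computed per-token embeddings from the cell above
# into a single hidden_size-dim vector for the whole snippet.
code_vector = context_embeddings.mean(dim=1).squeeze(0)
code_vector.shape  # torch.Size([768]) for codebert-base
```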
I believe I'll be able to merge this into the codebase tomorrow.