semeniak1997 opened 1 year ago
Hello, Javi Rando! I have adapted and optimized GuidedPasswordGeneration.ipynb so it can be run from the terminal. Let's call it generate_conditional.py:
```python
import argparse
import string

import torch
from transformers import GPT2LMHeadModel, RobertaTokenizerFast


def get_tokens(tokenizer, symbols):
    """Return the token ids for a list of single-character symbols."""
    return tokenizer(symbols, add_special_tokens=False).input_ids


def create_token_dict(tokenizer):
    """Map each template character class to the token ids it allows."""
    return {
        "l": get_tokens(tokenizer, list(string.ascii_lowercase)),
        "u": get_tokens(tokenizer, list(string.ascii_uppercase)),
        "d": get_tokens(tokenizer, list(string.digits)),
        "p": get_tokens(tokenizer, list(string.punctuation)),
    }


def conditional_generation(template, num_generations=1):
    generated = 0
    generations = []
    while generated < num_generations:
        # Start from the BOS token and grow the password one position at a time.
        generation = torch.tensor([tokenizer.bos_token_id]).unsqueeze(0)
        current_length = 1
        for char in template:
            if char in token_dict:
                # Forbid every token outside the class requested at this position.
                bad_tokens = [t for t in all_tokens if t not in token_dict[char]]
            else:
                # Wildcard position: only forbid ending the password early.
                bad_tokens = [[tokenizer.eos_token_id]]
            generation = model.generate(
                generation.to(args.device),
                do_sample=True,
                max_length=current_length + 1,
                pad_token_id=tokenizer.pad_token_id,
                num_return_sequences=1,
                bad_words_ids=bad_tokens,
            )
            current_length += 1
        # Keep only generations without token id 2 (the end-of-sequence token here).
        if 2 not in generation.flatten():
            generations.append(generation)
            generated += 1
    # Drop the leading BOS token before decoding.
    return torch.cat(generations, 0)[:, 1:]


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Conditionally generate passwords using PassGPT.")
    parser.add_argument("--model_path", type=str, help="Path to PassGPT model checkpoint", required=True)
    parser.add_argument("--tokenizer_path", type=str, help="Path to tokenizer checkpoint", required=True)
    parser.add_argument("--device", type=str, default="cuda", help="Device to run execution")
    parser.add_argument("--template", type=str, help="Password template (e.g., 'lluu**dd')")
    parser.add_argument("--maxchars", type=int, default=10, help="Maximum length of the passwords")
    parser.add_argument("--num_generations", type=int, default=1, help="Number of passwords to generate")
    args = parser.parse_args()

    model = GPT2LMHeadModel.from_pretrained(args.model_path).eval().to(args.device)
    tokenizer = RobertaTokenizerFast.from_pretrained(
        args.tokenizer_path,
        max_len=args.maxchars + 2,
        padding="max_length",
        truncation=True,
        do_lower_case=False,
        strip_accents=False,
        mask_token="<mask>",
        unk_token="<unk>",
        pad_token="<pad>",
        truncation_side="right",
    )

    token_dict = create_token_dict(tokenizer)
    all_tokens = [[i] for i in range(len(tokenizer))]

    generations = conditional_generation(args.template, args.num_generations)
    decoded_passwords = tokenizer.batch_decode(generations)
    for i, password in enumerate(decoded_passwords):
        print(f"Generated Password {i+1}: {password}")
```
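One optional hardening step, shown here as a purely illustrative sketch (`validate_template` is a hypothetical helper, not part of the script above): since any character outside `l`/`u`/`d`/`p` is treated as a wildcard, a typo in the template silently becomes an unconstrained position, so you could fail fast instead:

```python
# Hypothetical helper, not part of the script above: reject templates that
# use unknown class characters or exceed --maxchars before generation starts.
TEMPLATE_CLASSES = set("ludp")  # lowercase, uppercase, digit, punctuation

def validate_template(template: str, maxchars: int) -> None:
    if len(template) > maxchars:
        raise ValueError(f"Template has {len(template)} positions but --maxchars is {maxchars}.")
    unknown = [c for c in template if c not in TEMPLATE_CLASSES and c != "*"]
    if unknown:
        raise ValueError(f"Unexpected template characters {unknown!r}; use l, u, d, p, or *.")
```

Calling it right after `parse_args()` would catch mistakes before any sampling happens.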
Example run command:

```
python src/generate_conditional.py --model_path output_dir/last/ --tokenizer_path tokenizers_folder/byte_bpe_tokenizer_99/ --template "ullldp*" --maxchars 10 --num_generations 5
```

In the template, `l`, `u`, `d`, and `p` constrain a position to a lowercase letter, uppercase letter, digit, or punctuation character respectively, and any other character (e.g., `*`) leaves the position unconstrained.
Output:

```
Generated Password 1: Josi0!M
Generated Password 2: Meek2--
Generated Password 3: Sant0$S
Generated Password 4: Mana1**
Generated Password 5: Tomh8&&
```
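As a quick sanity check (again illustrative; `matches_template` is a hypothetical helper, not part of the script), each output line can be verified against the template's character classes:

```python
import string

# Hypothetical checker: confirm a generated password matches the template.
CLASS_SETS = {
    "l": set(string.ascii_lowercase),
    "u": set(string.ascii_uppercase),
    "d": set(string.digits),
    "p": set(string.punctuation),
}

def matches_template(password: str, template: str) -> bool:
    if len(password) != len(template):
        return False
    # Positions outside the four classes (e.g., '*') accept any character.
    return all(ch in CLASS_SETS[t] if t in CLASS_SETS else True
               for ch, t in zip(password, template))

assert matches_template("Josi0!M", "ullldp*")
```

All five sample passwords above satisfy "ullldp*" in this sense.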
This code looks good. Thanks a lot! Could you open a Pull Request so I can merge it into the existing codebase?