javirandor / passgpt

Other
47 stars 13 forks source link

Adaptation of GuidedPasswordGeneration.ipynb for terminal script #2

Open semeniak1997 opened 1 year ago

semeniak1997 commented 1 year ago

Hello, Javi Rando! I have adapted and optimized GuidedPasswordGeneration.ipynb so it can be run from the terminal as a standalone script, generate_conditional.py:

import os
import argparse
import torch
from transformers import GPT2LMHeadModel
from transformers import RobertaTokenizerFast
import string

def get_tokens(tokenizer, symbols):
    """Tokenize every entry of *symbols* and return the raw input-id lists.

    Special tokens (BOS/EOS/pad) are suppressed, so each entry maps one
    character to the token id(s) that encode it.
    """
    encoding = tokenizer(symbols, add_special_tokens=False)
    return encoding.input_ids

def create_token_dict(tokenizer):
    """Map template characters to the token ids allowed at that position.

    Keys: "l" lowercase letters, "u" uppercase letters, "d" digits,
    "p" punctuation.  Each value is a list of token-id lists, one per
    character, tokenized without special tokens.
    """
    classes = {
        "l": string.ascii_lowercase,
        "u": string.ascii_uppercase,
        "d": string.digits,
        "p": string.punctuation,
    }
    return {
        key: tokenizer(list(chars), add_special_tokens=False).input_ids
        for key, chars in classes.items()
    }

def conditional_generation(template, num_generations=1):
    """Generate passwords that follow a character-class *template*.

    Each character of *template* constrains one generated token:
    "l" lowercase, "u" uppercase, "d" digit, "p" punctuation; any other
    character (e.g. "*") is a wildcard that only forbids ending early.

    NOTE: relies on the module-level ``model``, ``tokenizer``, ``args``,
    ``token_dict`` and ``all_tokens`` set up in the ``__main__`` block.

    Args:
        template: Template string, one constraint per output character.
        num_generations: Number of passwords to produce.

    Returns:
        A LongTensor of shape (num_generations, len(template)) with the
        leading BOS column stripped.
    """
    # Pre-compute the forbidden-token list once per distinct template
    # character: it is invariant across samples, and the original code
    # re-scanned the whole vocabulary for every character of every sample.
    bad_tokens_for = {}
    for char in set(template):
        if char in token_dict:
            allowed = {tuple(t) for t in token_dict[char]}
            bad_tokens_for[char] = [i for i in all_tokens if tuple(i) not in allowed]
        else:
            # Wildcard position: only forbid terminating the password early.
            bad_tokens_for[char] = [[tokenizer.eos_token_id]]

    generated = 0
    generations = []

    while generated < num_generations:
        # Every sample starts from a lone BOS token.
        generation = torch.tensor([tokenizer.bos_token_id]).unsqueeze(0)
        current_length = 1

        for char in template:
            bad_tokens = bad_tokens_for[char]
            generation = model.generate(generation.to(args.device), do_sample=True, max_length=current_length+1, pad_token_id=tokenizer.pad_token_id, num_return_sequences=1,  bad_words_ids=bad_tokens)
            current_length += 1

        # Discard any sequence that emitted EOS early (password shorter than
        # the template).  The original hard-coded the magic id `2`, which
        # silently breaks for tokenizers with a different EOS id.
        if tokenizer.eos_token_id not in generation.flatten():
            generations.append(generation)
            generated += 1

    return torch.cat(generations, 0)[:, 1:]

# Script entry point: parse CLI arguments, load the PassGPT model and its
# tokenizer, then generate and print passwords matching --template.
# NOTE(review): the names bound here (args, model, tokenizer, token_dict,
# all_tokens) are read as globals by conditional_generation — do not rename.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Conditional generating passwords using PassGPT.")
    parser.add_argument("--model_path", type=str, help="Path to PassGPT model checkpoint", required=True)
    parser.add_argument("--tokenizer_path", type=str, help="Path to tokenizer checkpoint", required=True)
    parser.add_argument("--device", type=str, default='cuda', help="Device to run execution")
    parser.add_argument("--template", type=str, help="Password template (e.g., 'lluu**dd')")
    parser.add_argument("--maxchars", type=int, default=10, help="Maximum length of the passwords")
    parser.add_argument("--num_generations", type=int, default=1, help="Number of passwords to generate")

    args = parser.parse_args()

    # Inference only: eval mode, moved to the requested device.
    model = GPT2LMHeadModel.from_pretrained(args.model_path).eval().to(args.device)

    # maxchars + 2 leaves room for the BOS and EOS special tokens.
    tokenizer = RobertaTokenizerFast.from_pretrained(args.tokenizer_path,
                                                    max_len=args.maxchars+2,
                                                    padding="max_length",
                                                    truncation=True,
                                                    do_lower_case=False,
                                                    strip_accents=False,
                                                    mask_token="<mask>",
                                                    unk_token="<unk>",
                                                    pad_token="<pad>",
                                                    truncation_side="right")

    # Maps template chars ("l"/"u"/"d"/"p") to their allowed token ids.
    token_dict = create_token_dict(tokenizer)

    # Full vocabulary as single-token lists, the shape bad_words_ids expects.
    all_tokens = [[i] for i in range(len(tokenizer))]

    generations = conditional_generation(args.template, args.num_generations)

    decoded_passwords = tokenizer.batch_decode(generations)
    for i, password in enumerate(decoded_passwords):
        print(f"Generated Password {i+1}: {password}")

Example run command:

python src/generate_conditional.py --model_path output_dir/last/ --tokenizer_path tokenizers_folder/byte_bpe_tokenizer_99/ --template "ullldp*" --maxchars 10 --num_generations 5

Out:

Generated Password 1: Josi0!M
Generated Password 2: Meek2--
Generated Password 3: Sant0$S
Generated Password 4: Mana1**
Generated Password 5: Tomh8&&
javirandor commented 1 year ago

This code looks good. Thanks a lot! Could you open a Pull Request and I will merge into the existing codebase?