How to implement a transform that tokenizes text?

Luciennnnnnn commented 5 months ago

I have the following code:

import dotenv

dotenv.load_dotenv(override=True)

import random

import numpy as np
import torch
import torchvision

from datasets import load_from_disk

from ffcv.writer import DatasetWriter
from ffcv.fields import RGBImageField, BytesField

ffcv_dataset_path = 'dataset.beton'

from ffcv.loader import Loader, OrderOption
from ffcv.fields.decoders import NDArrayDecoder, FloatDecoder, RandomResizedCropRGBImageDecoder, BytesDecoder

from ffcv.transforms import ToTensor, ToTorchImage, NormalizeImage, Convert, ToDevice

class TokenizeText(torch.nn.Module):
    """Decode a UTF-8 byte buffer into a caption and tokenize it.

    With probability ``proportion_empty_prompts`` the caption is replaced by
    the empty string before tokenization.

    Args:
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding="max_length", truncation=True, return_tensors="pt")``
            whose result exposes ``input_ids``. Its ``model_max_length``
            attribute is read as the fallback for ``max_tokens``.
        args: Optional mapping of options; ``None`` is treated as an empty
            mapping. Recognised keys:
            ``'proportion_empty_prompts'`` (default ``1``) and
            ``'max_tokens'`` (default ``tokenizer.model_max_length``).
    """

    def __init__(self, tokenizer=None, args=None):
        super().__init__()
        self.tokenizer = tokenizer
        # forward() calls ``self.args.get`` unconditionally, so a missing
        # options mapping must become an empty one rather than stay ``None``.
        self.args = {} if args is None else args

    def forward(self, text):
        """Tokenize one caption.

        Args:
            text: Object exposing ``tobytes()`` whose bytes are UTF-8 text
                (presumably the uint8 array produced by the bytes decoder --
                TODO confirm against the pipeline).

        Returns:
            A ``(input_ids, is_null_caption)`` tuple, where ``input_ids`` is
            the tokenizer output's ``input_ids`` and ``is_null_caption`` is
            ``True`` when the caption was replaced by the empty string.

        Raises:
            ValueError: If the transform was constructed without a tokenizer.
        """
        if self.tokenizer is None:
            raise ValueError(
                "TokenizeText requires a tokenizer; got tokenizer=None"
            )

        caption = text.tobytes().decode('utf-8')

        # NOTE(review): random.random() is always < 1, so with the default of
        # 1 every caption is blanked. Pass 'proportion_empty_prompts'
        # explicitly (e.g. 0) to keep captions.
        drop_probability = self.args.get('proportion_empty_prompts', 1)
        is_null_caption = random.random() < drop_probability
        if is_null_caption:
            caption = ""

        inputs = self.tokenizer(
            caption,
            max_length=self.args.get('max_tokens', self.tokenizer.model_max_length),
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return inputs.input_ids, is_null_caption

from transformers import AutoTokenizer

# Load the tokenizer stored under the "tokenizer" subfolder of the
# Stable Diffusion v1.5 repository; use_fast=False requests the non-fast
# tokenizer implementation.
tokenizer_options = {"subfolder": "tokenizer", "use_fast": False}
tokenizer = AutoTokenizer.from_pretrained(
    "runwayml/stable-diffusion-v1-5", **tokenizer_options
)

# Image field: random-resized crop to 512x512, conversion to a float32
# torch image, then (x - 0) / 255 per channel via Normalize.
image_pipeline = [
    RandomResizedCropRGBImageDecoder(output_size=(512, 512)),
    ToTensor(),
    # ToDevice('cuda', non_blocking=True),
    ToTorchImage(),
    # NormalizeImage(mean=np.array([0.0, 0.0, 0.0]), std=np.array([255.0, 255.0, 255.0]), type=np.float32),
    Convert(torch.float32),
    torchvision.transforms.Normalize([0.0, 0.0, 0.0], [255.0, 255.0, 255.0]),
]

# Text field: raw bytes are decoded, then handed to the tokenizing module.
# Alternatives tried by the author are kept as comments.
text_pipeline = [
    BytesDecoder(),
    # TokenizeText(tokenizer, {}),
    TokenizeText(),
    # ToTensor(),
    # Convert(torch.float32),
]

# Randomly ordered, single-process (non-distributed) loader over the .beton
# file, yielding batches of 8 and dropping the final incomplete batch.
loader = Loader(
    ffcv_dataset_path,
    pipelines={"image": image_pipeline, "text": text_pipeline},
    batch_size=8,
    num_workers=8,
    order=OrderOption.RANDOM,
    seed=233,
    drop_last=True,
    os_cache=True,
    distributed=False,
)

# Inspect only the first batch: print the shape and contents of its second
# element (presumably the text pipeline output -- confirm against the
# dataset's field order), then stop iterating.
for data in loader:
    # print(data[0].shape)
    # print(data[0])
    second_field = data[1]
    print(second_field.shape)
    print(second_field)
    break

However, I encounter the following error:

Exception in thread Thread-1:
Traceback (most recent call last):
  File "/home/sist/luoxin/.conda/envs/py3.10+cu118+ffcv/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/home/sist/luoxin/.conda/envs/py3.10+cu118+ffcv/lib/python3.10/site-packages/ffcv/loader/epoch_iterator.py", line 84, in run
    result = self.run_pipeline(b_ix, ixes, slot, events[slot])
  File "/home/sist/luoxin/.conda/envs/py3.10+cu118+ffcv/lib/python3.10/site-packages/ffcv/loader/epoch_iterator.py", line 146, in run_pipeline
    results = stage_code(**args)
  File "", line 2, in stage_code_0
  File "/home/sist/luoxin/.conda/envs/py3.10+cu118+ffcv/lib/python3.10/site-packages/numba/core/dispatcher.py", line 468, in _compile_for_args
    error_rewrite(e, 'typing')
  File "/home/sist/luoxin/.conda/envs/py3.10+cu118+ffcv/lib/python3.10/site-packages/numba/core/dispatcher.py", line 409, in error_rewrite
    raise e.with_traceback(None)
numba.core.errors.TypingError: Failed in nopython mode pipeline (step: nopython frontend)
Untyped global name 'self': Cannot determine Numba type of <class 'ffcv.transforms.module.ModuleWrapper'>

File "../../.conda/envs/py3.10+cu118+ffcv/lib/python3.10/site-packages/ffcv/transforms/module.py", line 25:
        def apply_module(inp, _):
            res = self.module(inp)

tavisshore commented 5 months ago

This could be caused by an incompatible Python version or incompatible packages - the official installation instructions still recommend Python 3.9, which of course changes which package versions are available.

Luciennnnnnn commented 5 months ago

Hi @tavisshore, I can install ffcv 1.0.2 successfully. Could you say more about the available package versions you mentioned?

libffcv / ffcv

How to implement a transform that tokenizes text? #378