toranb / elixirconf2023

fine-tuning language models with Axon

Any reason not to use `DataFrame.from_csv`? #1


steffenix commented 1 year ago

I am looking at your example while debugging some issues with transformers, and I am wondering if my data loading could be the problem. I can see you have written your own data loader (https://github.com/toranb/elixirconf2023/blob/ede4b3262f74a67b691c04e44bcff41655058f3a/lib/example/data.ex#L4). Is there any reason for not using https://hexdocs.pm/explorer/Explorer.DataFrame.html#from_csv/2?

toranb commented 1 year ago

Great question! The talk was aimed at complete beginners, so I kept things simple for those who know Elixir but not the ML libraries. I did use DataFrame in my original work, like you see in the Bumblebee guides :)

defmodule Example.Data do
  # Loads a headerless CSV of {label, text} rows, streams it lazily,
  # and yields tokenized batches ready for training.
  def get_data(path, tokenizer, opts \\ []) do
    path
    |> Explorer.DataFrame.from_csv!(header: false)
    |> Explorer.DataFrame.rename(["label", "text"])
    |> stream()
    |> tokenize_and_batch(tokenizer, opts[:batch_size], opts[:sequence_length])
  end

  # Zips the text column with its label column as a lazy stream of
  # {text, label} tuples.
  def stream(df) do
    xs = df["text"]
    ys = df["label"]

    xs
    |> Explorer.Series.to_enum()
    |> Stream.zip(Explorer.Series.to_enum(ys))
  end

  # Chunks the stream into fixed-size batches, tokenizes each batch of
  # text, and stacks the labels into a single tensor.
  def tokenize_and_batch(stream, tokenizer, batch_size, sequence_length) do
    stream
    |> Stream.chunk_every(batch_size)
    |> Stream.map(fn batch ->
      {text, labels} = Enum.unzip(batch)
      tokenized = Bumblebee.apply_tokenizer(tokenizer, text, length: sequence_length)
      {tokenized, Nx.stack(labels)}
    end)
  end
end
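
For context, here is a minimal sketch of how `get_data/3` might be called. The model name, CSV path, and batch/sequence sizes below are assumptions for illustration, not values from the repo:

# A minimal usage sketch (assumed values; adapt to your own data and model).
{:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "bert-base-cased"})

batches =
  Example.Data.get_data("train.csv", tokenizer,
    batch_size: 32,
    sequence_length: 64
  )

# Each element is {tokenized_inputs, labels}, ready to feed into an
# Axon training loop.
{inputs, labels} = batches |> Enum.take(1) |> hd()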