Martin-Molinero opened 1 month ago
Added some more time series models
Repo: autogluon/chronos-t5-base. Revisions ['b6748377ca1c242cb95ed1187b8b3fe46942c023']
Repo: cardiffnlp/twitter-roberta-base-sentiment-latest. Revisions ['4ba3d4463bd152c9e4abd892b50844f30c646708']
Repo: mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis. Revisions ['ae0eab9ad336d7d548e0efe394b07c04bcaf6e91']
Repo: amazon/chronos-t5-small. Revisions ['476a71b73e6205f7987e811a81f355b9791c9256']
Repo: autogluon/chronos-t5-large. Revisions ['16dc70e284b7b209340c258c1375dcee93e3a768']
Repo: nickmuchi/sec-bert-finetuned-finance-classification. Revisions ['15cae24ba4089500a7e18f340e0286160b1daf14']
Repo: amazon/chronos-t5-tiny. Revisions ['d968d90a73cc4e3a3103e262d1d895204e74e415']
Repo: yiyanghkust/finbert-tone. Revisions ['4921590d3c0c3832c0efea24c8381ce0bda7844b']
Repo: microsoft/deberta-base. Revisions ['0d1b43ccf21b5acd9f4e5f7b077fa698f05cf195']
Repo: nickmuchi/distilroberta-finetuned-financial-text-classification. Revisions ['396d9c2c093f87875c3fdfa03ad7eed792e776e9']
Repo: FacebookAI/roberta-base. Revisions ['e2da8e2f811d1448a5b465c236feacd80ffbac7b']
Repo: google-bert/bert-base-uncased. Revisions ['86b5e0934494bd15c9632b12f734a8a67f723594']
Repo: amazon/chronos-t5-base. Revisions ['b6748377ca1c242cb95ed1187b8b3fe46942c023']
Repo: Salesforce/moirai-1.0-R-small. Revisions ['a34614afbe6b16fffbc11c77daba5aab3ed277fb']
Repo: nickmuchi/deberta-v3-base-finetuned-finance-text-classification. Revisions ['e07986b01cb87923b2e1622356f8093e173ee9a8']
Repo: amazon/chronos-t5-large. Revisions ['16dc70e284b7b209340c258c1375dcee93e3a768']
Repo: Salesforce/moirai-1.0-R-base. Revisions ['2149dc1c56c5d2684390ee4ec6fde58be4196c0c']
Repo: ahmedrachid/FinancialBERT-Sentiment-Analysis. Revisions ['656931965473ec085d195680bd62687b140c038f']
Repo: google/gemma-7b. Revisions ['a0eac5b80dba224e6ed79d306df50b1e92c2125d']
Repo: bardsai/finance-sentiment-fr-base. Revisions ['08571a47b6fadcd9814ea41c43e168523a1e2d64']
Repo: StephanAkkerman/FinTwitBERT-sentiment. Revisions ['da059da3b3bbcb43f9ed1aeb5ae61644010c7e1e']
Repo: Salesforce/moirai-1.0-R-large. Revisions ['2665aa4fcc9edc1402a3ad1243addfe32cd2178f']
Repo: ProsusAI/finbert. Revisions ['4556d13015211d73dccd3fdd39d39232506f3e43']
Repo: openai-community/gpt2. Revisions ['607a30d783dfa663caf39e06633721c8d4cfcd7e']
Repo: autogluon/chronos-t5-tiny. Revisions ['d968d90a73cc4e3a3103e262d1d895204e74e415']
Repo: distilbert/distilbert-base-uncased. Revisions ['12040accade4e8a0f71eabdb258fecc2e7e948be']
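Each repo above is pinned to a specific revision. When loading one of these models from the local cache, the same revision can be requested explicitly so the cached snapshot is used; a minimal sketch, assuming the standard transformers from_pretrained keyword arguments:

from transformers import AutoTokenizer, AutoModelForSequenceClassification

# pin the installed revision of ProsusAI/finbert (revision string from the list above)
revision = "4556d13015211d73dccd3fdd39d39232506f3e43"
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert", revision=revision, local_files_only=True)
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert", revision=revision, local_files_only=True)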
# use the "amazon/chronos-t5-tiny" time series model, feeding in SPY historical daily data
def use_times_series_model(self):
    import matplotlib.pyplot as plt
    import numpy as np
    import pandas as pd
    import torch
    from chronos import ChronosPipeline

    pipeline = ChronosPipeline.from_pretrained(
        "amazon/chronos-t5-tiny",
        device_map="cuda",
        torch_dtype=torch.bfloat16,
    )
    spy = self.add_equity("SPY", resolution=Resolution.DAILY)
    df = self.history([spy.symbol], start=datetime(2023, 1, 1), end=datetime(2024, 1, 1))

    # context must be either a 1D tensor, a list of 1D tensors,
    # or a left-padded 2D tensor with batch as the first dimension
    context = torch.tensor(df["close"])
    prediction_length = 12
    forecast = pipeline.predict(context, prediction_length)

    # summarize the forecast quantiles
    forecast_index = range(len(df), len(df) + prediction_length)
    low, median, high = np.quantile(forecast[0].numpy(), [0.1, 0.5, 0.9], axis=0)
    self.log(f'Prediction low: {low}. median: {median}. high: {high}. forecast: {str(forecast)}')
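    # a minimal sketch (not in the original): plot the median forecast and the
    # 80% prediction interval after the historical closes, reusing forecast_index
    plt.figure(figsize=(8, 4))
    plt.plot(df["close"].values, color="royalblue", label="history")
    plt.plot(forecast_index, median, color="tomato", label="median forecast")
    plt.fill_between(forecast_index, low, high, color="tomato", alpha=0.3, label="80% prediction interval")
    plt.legend()
    plt.show()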
See TODO: we assign a random positive/negative label to each news item, but this could be derived from the news content itself.
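One content-based alternative, sketched here as a hypothetical keyword heuristic over the description text (this helper is illustrative and not part of the original; a real labeler would be more involved):

# hypothetical helper: label a news description 1 (positive) or 0 (negative)
# from a simple keyword heuristic instead of a random draw
def label_from_description(description: str) -> int:
    positive_words = ("beat", "record", "growth", "upgrade", "surge")
    text = (description or "").lower()
    return int(any(word in text for word in positive_words))

Such a helper could replace the np.random.randint call in fetch_tiingo_news_dataset below.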
def get_object_store_model_path(self, model_name):
    adjusted_model_name = model_name.replace('/', '-')
    return self.object_store.get_file_path(f'llm/fine-tune/{adjusted_model_name}/')
# example of how to fine-tune the "ProsusAI/finbert" model from the cache and store it in the object store
def fine_tune_finbert_model(self):
    ### CREATE MODEL
    import numpy as np
    import tensorflow as tf
    from transformers import TFBertForSequenceClassification, BertTokenizer

    model_name = "ProsusAI/finbert"
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = TFBertForSequenceClassification.from_pretrained(model_name, num_labels=3, from_pt=True)

    ### FETCH THE DATA
    from datasets import Dataset

    def fetch_tiingo_news_dataset(start_date, end_date):
        aapl = self.add_equity("AAPL", Resolution.MINUTE)
        dataset_symbol = self.add_data(TiingoNews, aapl.symbol).symbol
        history_df = self.history(dataset_symbol, start_date, end_date, Resolution.DAILY)
        history_df = history_df.reset_index()[['description']]
        # TODO: random positive/negative labels; this could instead be based on the news tags/description, etc.
        history_df['label'] = np.random.randint(0, 2, size=len(history_df))
        # rename the description column to text
        history_df = history_df.rename(columns={"description": "text"})
        # create the dataset from the pandas dataframe
        dataset = Dataset.from_pandas(history_df)

        def preprocess_function(examples):
            return tokenizer(examples['text'], padding='max_length', truncation=True)

        encoded_dataset = dataset.map(preprocess_function, batched=True)
        return encoded_dataset

    # Split dataset
    train_dataset = fetch_tiingo_news_dataset(datetime(2023, 10, 1), datetime(2024, 1, 1))
    eval_dataset = fetch_tiingo_news_dataset(datetime(2024, 1, 1), datetime(2024, 2, 1))

    ### FINE-TUNE THE MODEL
    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    model.compile(optimizer=optimizer, loss=loss)
    tf_dataset = model.prepare_tf_dataset(train_dataset, batch_size=16, shuffle=True, tokenizer=tokenizer)
    model.fit(tf_dataset, epochs=2, steps_per_epoch=115)

    # save the fine-tuned model and tokenizer to an object store location
    output_dir = self.get_object_store_model_path(model_name)
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
# use the "ProsusAI/finbert" model from the cache
def use_original_finbert_model(self):
    self.use_tf_finbert_model("ProsusAI/finbert")

# use the fine-tuned "ProsusAI/finbert" model from the object store
def use_fine_tuned_finbert_model(self):
    from pathlib import Path
    model_dir = self.get_object_store_model_path("ProsusAI/finbert")
    self.use_tf_finbert_model(Path(model_dir))
def use_tf_finbert_model(self, model_path):
    from transformers import TFAutoModelForSequenceClassification, AutoTokenizer
    import tensorflow as tf

    # Load the tokenizer and the model
    tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
    model = TFAutoModelForSequenceClassification.from_pretrained(model_path, local_files_only=True)

    # for debugging and comparing the original model against fine-tuned versions
    self.log(f"Using model: {str(model)}")
    for layer in model.layers:
        self.log(f"LAYER: {layer.weights}")

    # Prepare the input sentences
    sentences = ["Stocks rallied and the British pound gained."]
    inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors='tf')

    # Get the model outputs
    outputs = model(**inputs)

    # Apply softmax to the logits to get probabilities
    res = tf.nn.softmax(outputs.logits, axis=-1).numpy()
    self.log(str(res))
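    # a minimal sketch (not in the original): map each class probability to its
    # label name via the transformers id2label mapping, assuming the config of
    # this checkpoint populates it
    for sentence, probs in zip(sentences, res):
        scores = {model.config.id2label[i]: float(p) for i, p in enumerate(probs)}
        self.log(f"{sentence} -> {scores}")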
See TODO: the model wants evenly sampled data, but equity prices don't exist on weekends, so one option is to fake the time axis with an incremental time counter.
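A minimal sketch of that idea, assuming df is the SPY history DataFrame used below (hypothetical; the example below instead resamples to calendar days, which leaves NaNs on weekends):

import pandas as pd

# hypothetical workaround: replace the calendar index with a synthetic, evenly
# spaced daily PeriodIndex so consecutive trading days become consecutive steps
adjusted_df = df.reset_index()[['close']].rename(columns={'close': 'target'})
adjusted_df.index = pd.period_range(start="2023-01-01", periods=len(adjusted_df), freq="D")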
# use the "Salesforce/moirai-1.0-R-{SIZE}" time series model, feeding in SPY historical daily data
def use_times_series_model_moirai(self):
    import torch
    import matplotlib.pyplot as plt
    import pandas as pd
    from gluonts.dataset.pandas import PandasDataset
    from gluonts.dataset.split import split
    from uni2ts.eval_util.plot import plot_single
    from uni2ts.model.moirai import MoiraiForecast, MoiraiModule

    SIZE = "small"  # model size: choose from {'small', 'base', 'large'}
    PDT = 20        # prediction length: any positive integer
    CTX = 200       # context length: any positive integer
    PSZ = "auto"    # patch size: choose from {"auto", 8, 16, 32, 64, 128}
    BSZ = 32        # batch size: any positive integer
    TEST = 100      # test set length: any positive integer

    # Read data into a pandas DataFrame
    spy = self.add_equity("SPY", resolution=Resolution.DAILY)
    df = self.history([spy.symbol], start=datetime(2023, 1, 1), end=datetime(2024, 1, 1))
    adjusted_df = df.reset_index()[['time', 'close']]
    adjusted_df = adjusted_df.rename(columns={'close': 'target'})
    adjusted_df['time'] = pd.to_datetime(adjusted_df['time'])
    adjusted_df.set_index('time', inplace=True)
    # TODO: the model wants the data evenly sampled
    adjusted_df = adjusted_df.resample('D').asfreq()
    ds = PandasDataset(adjusted_df, freq="D")

    # Split into train/test set: assign the last TEST time steps to the test set
    train, test_template = split(ds, offset=-TEST)

    # Construct rolling window evaluation
    test_data = test_template.generate_instances(
        prediction_length=PDT,  # number of time steps for each prediction
        windows=TEST // PDT,    # number of windows in rolling window evaluation
        distance=PDT,           # number of time steps between windows; distance=PDT gives non-overlapping windows
    )

    # Prepare the pre-trained model by downloading model weights from the Hugging Face hub
    model = MoiraiForecast(
        module=MoiraiModule.from_pretrained(f"Salesforce/moirai-1.0-R-{SIZE}"),
        prediction_length=PDT,
        context_length=CTX,
        patch_size=PSZ,
        num_samples=100,
        target_dim=1,
        feat_dynamic_real_dim=ds.num_feat_dynamic_real,
        past_feat_dynamic_real_dim=ds.num_past_feat_dynamic_real,
    )
    predictor = model.create_predictor(batch_size=BSZ)
    forecasts = predictor.predict(test_data.input)
    forecast_it = iter(forecasts)
    forecast = next(forecast_it)
    self.log(str(forecast))
Example of fine-tuning the Amazon Chronos time series model using daily data and storing it in the object store.
See TODO: the model wants evenly sampled data, but equity prices don't exist on weekends, so one option is to fake the time axis with an incremental time counter, similar to the "Salesforce/moirai-1.0-R-{SIZE}" model above.
def get_object_store_model_path(self, model_name):
    adjusted_model_name = model_name.replace('/', '-')
    return self.object_store.get_file_path(f'llm/fine-tune/{adjusted_model_name}/')
def fine_tune_chronos_model(self):
    import pandas as pd

    # Read data into a pandas DataFrame
    spy = self.add_equity("SPY", resolution=Resolution.DAILY)
    df = self.history([spy.symbol], start=datetime(2023, 1, 1), end=datetime(2024, 1, 1))
    adjusted_df = df.reset_index()[['time', 'close']]
    adjusted_df = adjusted_df.rename(columns={'close': 'target'})
    adjusted_df['time'] = pd.to_datetime(adjusted_df['time'])
    adjusted_df.set_index('time', inplace=True)
    # TODO: the model wants the data evenly sampled
    adjusted_df = adjusted_df.resample('D').asfreq()

    model_name = "amazon/chronos-t5-tiny"
    model_dir = self.get_object_store_model_path(model_name)
    self.train_chronos([adjusted_df],
                       model_id=model_name,
                       output_dir=model_dir,
                       # tf32 requires Ampere GPUs (e.g., A100)
                       tf32=False,
                       # TODO: change me, 10 is just a quick tune
                       max_steps=10)
def train_chronos(self, training_data,
                  probability: Optional[str] = None,
                  context_length: int = 512,
                  prediction_length: int = 64,
                  min_past: int = 64,
                  max_steps: int = 200_000,
                  save_steps: int = 50_000,
                  log_steps: int = 500,
                  per_device_train_batch_size: int = 32,
                  learning_rate: float = 1e-3,
                  optim: str = "adamw_torch_fused",
                  shuffle_buffer_length: int = 100,
                  gradient_accumulation_steps: int = 2,
                  model_id: str = "google/t5-efficient-tiny",
                  model_type: str = "seq2seq",
                  random_init: bool = False,
                  tie_embeddings: bool = False,
                  output_dir: str = "./output/",
                  tf32: bool = True,
                  torch_compile: bool = True,
                  tokenizer_class: str = "MeanScaleUniformBins",
                  tokenizer_kwargs: str = "{'low_limit': -15.0, 'high_limit': 15.0}",
                  n_tokens: int = 4096,
                  n_special_tokens: int = 2,
                  pad_token_id: int = 0,
                  eos_token_id: int = 1,
                  use_eos_token: bool = True,
                  lr_scheduler_type: str = "linear",
                  warmup_ratio: float = 0.0,
                  dataloader_num_workers: int = 1,
                  max_missing_prop: float = 0.9,
                  num_samples: int = 20,
                  temperature: float = 1.0,
                  top_k: int = 50,
                  top_p: float = 1.0,
                  seed: Optional[int] = None):
    from ast import literal_eval
    from pathlib import Path
    from functools import partial
    from typing import List, Iterator, Optional, Dict
    from torch.utils.data import IterableDataset, get_worker_info
    from transformers import Trainer, TrainingArguments, set_seed
    from gluonts.dataset.pandas import PandasDataset
    from gluonts.itertools import Filter
    from chronos import ChronosConfig
    # load the helper training scripts and set the logger instance
    from chronos.scripts.training.train import ChronosDataset, has_enough_observations, load_model
    from chronos.scripts.training import train
    from logging import getLogger, INFO
    train.logger = getLogger()
    train.logger.setLevel(INFO)

    output_dir = Path(output_dir)
    if isinstance(probability, str):
        probability = literal_eval(probability)
    elif probability is None:
        probability = [1.0 / len(training_data)] * len(training_data)
    if isinstance(tokenizer_kwargs, str):
        tokenizer_kwargs = literal_eval(tokenizer_kwargs)
    assert isinstance(tokenizer_kwargs, dict)

    assert model_type in ["seq2seq", "causal"]
    if not model_type == "seq2seq":
        raise NotImplementedError("Only seq2seq models are currently supported")

    if seed is None:
        import random
        seed = random.randint(0, 2**32)
    # transformers
    set_seed(seed=seed)
    self.log(f"Output dir: {output_dir}. Using SEED: {seed}. Mixing probabilities: {probability}")
    self.log(f"Loading and filtering {len(training_data)} datasets for training: {training_data}")

    train_datasets = [
        Filter(
            partial(
                has_enough_observations,
                min_length=min_past + prediction_length,
                max_missing_prop=max_missing_prop,
            ),
            PandasDataset(data_frame, freq="D"),
        )
        for data_frame in training_data
    ]

    self.log("Initializing model")
    model = load_model(
        model_id=model_id,
        model_type=model_type,
        vocab_size=n_tokens,
        random_init=random_init,
        tie_embeddings=tie_embeddings,
        pad_token_id=pad_token_id,
        eos_token_id=eos_token_id,
    )
    chronos_config = ChronosConfig(
        tokenizer_class=tokenizer_class,
        tokenizer_kwargs=tokenizer_kwargs,
        n_tokens=n_tokens,
        n_special_tokens=n_special_tokens,
        pad_token_id=pad_token_id,
        eos_token_id=eos_token_id,
        use_eos_token=use_eos_token,
        model_type=model_type,
        context_length=context_length,
        prediction_length=prediction_length,
        num_samples=num_samples,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
    )
    # Add extra items to the model config so they are saved in the checkpoint
    model.config.chronos_config = chronos_config.__dict__

    shuffled_train_dataset = ChronosDataset(
        datasets=train_datasets,
        probabilities=probability,
        tokenizer=chronos_config.create_tokenizer(),
        context_length=context_length,
        prediction_length=prediction_length,
        min_past=min_past,
        mode="training",
    ).shuffle(shuffle_buffer_length=shuffle_buffer_length)

    # Define training args
    training_args = TrainingArguments(
        output_dir=str(output_dir),
        per_device_train_batch_size=per_device_train_batch_size,
        learning_rate=learning_rate,
        lr_scheduler_type=lr_scheduler_type,
        warmup_ratio=warmup_ratio,
        optim=optim,
        logging_dir=str(output_dir / "train-logs"),
        logging_strategy="steps",
        logging_steps=log_steps,
        save_strategy="steps",
        save_steps=save_steps,
        report_to=["tensorboard"],
        max_steps=max_steps,
        gradient_accumulation_steps=gradient_accumulation_steps,
        dataloader_num_workers=dataloader_num_workers,
        tf32=tf32,  # remove this if not using Ampere GPUs (e.g., A100)
        torch_compile=torch_compile,
        ddp_find_unused_parameters=False,
        remove_unused_columns=False,
    )

    # Create Trainer instance
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=shuffled_train_dataset,
    )
    self.log("Training start...")
    trainer.train()
    self.log("Training ended!")

    model.save_pretrained(output_dir)
def get_object_store_model_path(self, model_name):
    adjusted_model_name = model_name.replace('/', '-')
    return self.object_store.get_file_path(f'llm/fine-tune/{adjusted_model_name}/')
def use_times_series_model_chronos_from_cache(self):
    self.use_times_series_model_chronos("amazon/chronos-t5-tiny")

def use_fine_tuned_chronos_model(self):
    from pathlib import Path
    model_dir = self.get_object_store_model_path("amazon/chronos-t5-tiny")
    self.use_times_series_model_chronos(Path(model_dir))

# use the given time series model, feeding in SPY historical daily data
def use_times_series_model_chronos(self, model_name_or_path):
    import matplotlib.pyplot as plt
    import numpy as np
    import pandas as pd
    import torch
    from chronos import ChronosPipeline

    pipeline = ChronosPipeline.from_pretrained(
        model_name_or_path,
        device_map="cuda",
        torch_dtype=torch.bfloat16,
    )
    self.log(f"Using model: {str(pipeline)}")

    spy = self.add_equity("SPY", resolution=Resolution.DAILY)
    df = self.history([spy.symbol], start=datetime(2023, 1, 1), end=datetime(2024, 1, 1))

    # context must be either a 1D tensor, a list of 1D tensors,
    # or a left-padded 2D tensor with batch as the first dimension
    context = torch.tensor(df["close"])
    prediction_length = 12
    forecast = pipeline.predict(context, prediction_length)

    # summarize the forecast quantiles
    forecast_index = range(len(df), len(df) + prediction_length)
    low, median, high = np.quantile(forecast[0].numpy(), [0.1, 0.5, 0.9], axis=0)
    self.log(f'Prediction low: {low}. median: {median}. high: {high}. forecast: {str(forecast)}')
These models are available in live, backtesting & research in the cloud environment.
Access installed models and their revisions: the current output is the repo/revision list at the top of this PR.
Example of how to use the ProsusAI/finbert model: see use_original_finbert_model and use_fine_tuned_finbert_model above.
Example of how to train a model: see fine_tune_finbert_model, fine_tune_chronos_model and train_chronos above.
TODO WIP: pending adding new libraries to improve usage.