CassioML / cassio

A framework-agnostic Python library to seamlessly integrate Cassandra with ML/LLM/genAI workloads
Apache License 2.0
101 stars 18 forks source link

Cassio Agents Metadata schema for Langchain #46

Closed xingh closed 6 months ago

xingh commented 1 year ago

Langchain has the following "objects" that can be driven by data stored elsewhere, rather than being hardcoded.

These represent the choices users can make

In the beginning these can just be Python objects, or rather JSON configs; then we can move to tables.

AgentTypes (stores the registry of Agent Types)

LLMTypes

ToolTypes

DocumentLoaderTypes

This represents what a user can define and store

Can start with JSON and then move to Tables

LLMConfiguration

Agent

Tools

DocumentLoaderConfiguration

Index

Strategy to implement

This is just the overall spec of what goes into an agent at a high level. The recommendation is to first implement a 100% JSON-driven agent, then implement it in a schema, since that will only be an optimization of where the config is stored rather than a change in functionality. This is applicable to future agent frameworks.

karlunho-datastax commented 1 year ago

@erichare to review and communicate to the streaming team.

karlunho-datastax commented 1 year ago

Elena to prototype the Agent metadata store utility to create, read, and store Agent metadata from Langchain

xingh commented 1 year ago

I think @ElenaKusevska started working on ingestion stuff related to arxiv. As I took over langchain-ui, some of this is done, currently via Prisma / NextJS / Supabase

// PromptTemplate: a named prompt string plus its JSON `inputs`, owned by one User.
model PromptTemplate {
  // Integer primary key, generated via autoincrement().
  id        Int       @id @default(autoincrement())
  createdAt DateTime  @default(now())
  // @updatedAt: Prisma stores the time of the last update automatically.
  updatedAt DateTime  @updatedAt
  prompt    String
  // Owning user; onDelete: Cascade deletes this template when the User is deleted.
  // NOTE(review): the User model belonging to this schema is not shown in this excerpt.
  user      User      @relation(fields: [userId], references: [id], onDelete: Cascade)
  userId    String
  // NOTE(review): presumably the template's input variables — confirm the JSON shape.
  inputs    Json
  name      String
  // Back-relation: Chatbots whose promptTemplateId references this template.
  Chatbot   Chatbot[]
}

// Datasource: a named, URL-addressed data source owned by one User.
model Datasource {
  // Integer primary key, generated via autoincrement().
  id        Int       @id @default(autoincrement())
  createdAt DateTime  @default(now())
  // @updatedAt: Prisma stores the time of the last update automatically.
  updatedAt DateTime  @updatedAt
  // Owning user; onDelete: Cascade deletes this datasource when the User is deleted.
  user      User      @relation(fields: [userId], references: [id], onDelete: Cascade)
  userId    String
  url       String
  name      String
  // Free-form string (not an enum), so allowed values are not constrained by the schema.
  type      String
  // Back-relation: Chatbots whose datasourceId references this datasource.
  Chatbot   Chatbot[]
}

// Chatbot: a named bot owned by one User, optionally linked to one PromptTemplate
// and one Datasource, with a list of ChatbotMessage rows.
model Chatbot {
  // Integer primary key, generated via autoincrement().
  id               Int              @id @default(autoincrement())
  createdAt        DateTime         @default(now())
  // @updatedAt: Prisma stores the time of the last update automatically.
  updatedAt        DateTime         @updatedAt
  // Owning user; onDelete: Cascade deletes this chatbot when the User is deleted.
  user             User             @relation(fields: [userId], references: [id], onDelete: Cascade)
  userId           String
  // Optional link (nullable foreign key); no onDelete action is specified here.
  promptTemplate   PromptTemplate?  @relation(fields: [promptTemplateId], references: [id])
  promptTemplateId Int?
  // Optional link (nullable foreign key); no onDelete action is specified here.
  datasource       Datasource?      @relation(fields: [datasourceId], references: [id])
  datasourceId     Int?
  name             String
  // Back-relation: messages whose chatbotId references this chatbot.
  ChatbotMessage   ChatbotMessage[]
}

// ChatbotMessage: one stored message, optionally attached to a Chatbot.
model ChatbotMessage {
  // Integer primary key, generated via autoincrement().
  id        Int      @id @default(autoincrement())
  createdAt DateTime @default(now())
  // @updatedAt: Prisma stores the time of the last update automatically.
  updatedAt DateTime @updatedAt
  // Optional parent; onDelete: Cascade deletes the message when its Chatbot is deleted.
  chatbot   Chatbot? @relation(fields: [chatbotId], references: [id], onDelete: Cascade)
  chatbotId Int?
  // Mapped to the native VarChar type with no explicit length argument.
  message   String   @db.VarChar()
  // NOTE(review): presumably identifies who produced the message — confirm allowed values.
  agent     String
}

The superagent.sh schema also has a good starting point.


// DocumentType: the allowed values of Document.type (which defaults to TXT).
enum DocumentType {
  TXT
  PDF
  CSV
  YOUTUBE
  OPENAPI
  URL
  MARKDOWN
  FIRESTORE
  PSYCHIC
  GITHUB_REPOSITORY
  WEBPAGE
  STRIPE
  AIRTABLE
  SITEMAP
  NOTION
}

// ToolType: the allowed values of the optional Tool.type field.
enum ToolType {
  BROWSER
  SEARCH
  WOLFRAM_ALPHA
  REPLICATE
  ZAPIER_NLA
  AGENT
  OPENAPI
  CHATGPT_PLUGIN
  METAPHOR
}

// User: account record and the owner side of most other models in this schema.
model User {
  // String primary key generated with cuid(), stored as VarChar(255).
  id          String       @id @default(cuid()) @db.VarChar(255)
  // Unique per user.
  email       String       @unique @db.VarChar(255)
  // Optional. NOTE(review): presumably a password hash, not plaintext — confirm.
  password    String?      @db.VarChar(255)
  name        String?      @db.VarChar(255)
  createdAt   DateTime?    @default(now())
  // @updatedAt: Prisma stores the time of the last update automatically.
  updatedAt   DateTime?    @updatedAt
  // NOTE(review): presumably a soft-delete marker — confirm how queries treat it.
  deletedAt   DateTime?
  // Optional one-to-one back-relation (Profile.userId is @unique).
  profile     Profile?
  // One-to-many back-relations: each listed model holds a userId referencing this User.
  Agent       Agent[]
  ApiToken    ApiToken[]
  Document    Document[]
  Prompt      Prompt[]
  Tool        Tool[]
  AgentTrace  AgentTrace[]
  // NOTE(review): presumably external auth provider name and its token — confirm.
  provider    String?
  accessToken String?
  Tag         Tag[]
}

// Profile: per-user JSON metadata; at most one Profile per User.
model Profile {
  // String primary key generated with cuid(), stored as VarChar(255).
  id       String @id @default(cuid()) @db.VarChar(255)
  // @unique makes the User <-> Profile relation one-to-one.
  userId   String @unique @db.VarChar(255)
  user     User   @relation(fields: [userId], references: [id])
  // Optional JSON; defaults to an empty object.
  metadata Json?  @default("{}")
}

// Document: a user-owned document record with a source type, optional URL/content,
// and several optional JSON configuration fields.
model Document {
  // String primary key generated with cuid(), stored as VarChar(255).
  id            String          @id @default(cuid()) @db.VarChar(255)
  description   String?         @db.Text()
  userId        String          @db.VarChar(255)
  // Owning user; no onDelete action is specified here.
  user          User            @relation(fields: [userId], references: [id])
  // Source kind; defaults to TXT.
  type          DocumentType    @default(TXT)
  url           String?         @db.Text()
  content       String?         @db.Text()
  // NOTE(review): presumably a hash of `content` used to detect changes — confirm.
  contentHash   String?         @db.VarChar(255)
  name          String
  // NOTE(review): presumably text-splitter settings — confirm the JSON shape.
  splitter      Json?
  createdAt     DateTime?       @default(now())
  // @updatedAt: Prisma stores the time of the last update automatically.
  updatedAt     DateTime?       @updatedAt
  // Optional JSON blobs; their shapes are not defined by this schema.
  index         Json?
  authorization Json?
  metadata      Json?
  // Back-relation: Agents whose documentId references this document directly.
  Agent         Agent[]
  // Back-relation: AgentDocument link rows referencing this document.
  AgentDocument AgentDocument[]
}

// Agent: a user-owned agent definition — type, LLM settings, flags, and optional
// links to a Document, Tool and Prompt. Documents and tools can also be attached
// through the AgentDocument / AgentTool link models.
model Agent {
  // String primary key generated with cuid(), stored as VarChar(255).
  id             String          @id @default(cuid()) @db.VarChar(255)
  description    String?         @db.Text()
  avatarUrl      String?         @db.Text()
  // NOTE(review): presumably a token used to share the agent — confirm.
  shareableToken String?         @db.Text()
  userId         String          @db.VarChar(255)
  // Owning user; no onDelete action is specified here.
  user           User            @relation(fields: [userId], references: [id])
  // Optional direct link to a single Document (nullable foreign key).
  document       Document?       @relation(fields: [documentId], references: [id])
  documentId     String?         @db.VarChar(255)
  // Optional direct link to a single Tool (nullable foreign key).
  tool           Tool?           @relation(fields: [toolId], references: [id])
  toolId         String?         @db.VarChar(255)
  // Optional JSON; defaults to an empty array.
  tags           Json?           @default("[]")
  // Optional link to a single Prompt (nullable foreign key).
  prompt         Prompt?         @relation(fields: [promptId], references: [id])
  promptId       String?         @db.VarChar(255)
  name           String
  // Defaults to REACT.
  // NOTE(review): the AgentType enum is not declared in this excerpt.
  type           AgentType       @default(REACT)
  // Required JSON; defaults to provider "openai-chat" with model "gpt-3.5-turbo".
  llm            Json            @default("{ \"provider\": \"openai-chat\", \"model\": \"gpt-3.5-turbo\" }")
  // Boolean flags, all false by default.
  hasMemory      Boolean         @default(false)
  isPublic       Boolean         @default(false)
  isListed       Boolean         @default(false)
  // Back-relation: AgentMemory rows referencing this agent.
  AgentMemory    AgentMemory[]
  createdAt      DateTime?       @default(now())
  // @updatedAt: Prisma stores the time of the last update automatically.
  updatedAt      DateTime?       @updatedAt
  // Back-relations: rows in these models hold an agentId referencing this agent.
  AgentTrace     AgentTrace[]
  AgentDocument  AgentDocument[]
  AgentTool      AgentTool[]
}

// ApiToken: a described token string belonging to one User.
model ApiToken {
  // String primary key generated with cuid(), stored as VarChar(255).
  id          String @id @default(cuid()) @db.VarChar(255)
  userId      String @db.VarChar(255)
  // Owning user; no onDelete action is specified here.
  user        User   @relation(fields: [userId], references: [id])
  description String @db.VarChar(255)
  // NOTE(review): no @unique constraint on the token value — confirm this is intended.
  token       String
}

// AgentMemory: one stored message for an Agent, tagged with its author type and an
// optional session string.
model AgentMemory {
  // String primary key generated with cuid(), stored as VarChar(255).
  id        String                @id @default(cuid()) @db.VarChar(255)
  agentId   String                @db.VarChar(255)
  // Parent agent; onDelete: Cascade deletes this row when the Agent is deleted.
  agent     Agent                 @relation(fields: [agentId], references: [id], onDelete: Cascade)
  // Defaults to HUMAN.
  // NOTE(review): the AgentMemoryAuthorType enum is not declared in this excerpt.
  author    AgentMemoryAuthorType @default(HUMAN)
  message   String                @db.Text()
  // NOTE(review): presumably groups messages into a conversation — confirm.
  session   String?
  createdAt DateTime?             @default(now())
  // @updatedAt: Prisma stores the time of the last update automatically.
  updatedAt DateTime?             @updatedAt
  // NOTE(review): presumably a soft-delete marker — confirm how queries treat it.
  deletedAt DateTime?
}

// AgentTrace: a JSON `data` record tied to both a User and an Agent.
model AgentTrace {
  // String primary key generated with cuid(), stored as VarChar(255).
  id        String    @id @default(cuid()) @db.VarChar(255)
  userId    String    @db.VarChar(255)
  // Owning user; no onDelete action is specified here.
  user      User      @relation(fields: [userId], references: [id])
  agentId   String    @db.VarChar(255)
  // Required JSON payload; its shape is not defined by this schema.
  data      Json
  // Parent agent; onDelete: Cascade deletes this trace when the Agent is deleted.
  agent     Agent     @relation(fields: [agentId], references: [id], onDelete: Cascade)
  createdAt DateTime? @default(now())
  // @updatedAt: Prisma stores the time of the last update automatically.
  updatedAt DateTime? @updatedAt
}

// Prompt: a named, user-owned template text with its JSON `input_variables`.
model Prompt {
  // String primary key generated with cuid(), stored as VarChar(255).
  id              String    @id @default(cuid()) @db.VarChar(255)
  name            String
  template        String    @db.Text()
  // NOTE(review): presumably the variable names used in `template` — confirm the JSON shape.
  input_variables Json
  userId          String    @db.VarChar(255)
  // Owning user; no onDelete action is specified here.
  user            User      @relation(fields: [userId], references: [id])
  createdAt       DateTime? @default(now())
  // @updatedAt: Prisma stores the time of the last update automatically.
  updatedAt       DateTime? @updatedAt
  // NOTE(review): presumably a soft-delete marker — confirm how queries treat it.
  deletedAt       DateTime?
  // Back-relation: Agents whose promptId references this prompt.
  Agent           Agent[]
}

// AgentDocument: link row between an Agent (required) and a Document (optional).
model AgentDocument {
  // String primary key generated with cuid(), stored as VarChar(255).
  id         String    @id @default(cuid()) @db.VarChar(255)
  // Optional side of the link (nullable foreign key); no onDelete action specified.
  document   Document? @relation(fields: [documentId], references: [id])
  documentId String?   @db.VarChar(255)
  agentId    String    @db.VarChar(255)
  // Required side; onDelete: Cascade deletes this link when the Agent is deleted.
  agent      Agent     @relation(fields: [agentId], references: [id], onDelete: Cascade)
  createdAt  DateTime? @default(now())
  // @updatedAt: Prisma stores the time of the last update automatically.
  updatedAt  DateTime? @updatedAt
  // NOTE(review): presumably a soft-delete marker — confirm how queries treat it.
  deletedAt  DateTime?
}

// Tag: a named, optionally colored label owned by one User.
model Tag {
  // String primary key generated with cuid(), stored as VarChar(255).
  id        String    @id @default(cuid()) @db.VarChar(255)
  name      String
  // Optional color string; defaults to "#0e8a16".
  color     String?   @default("#0e8a16")
  userId    String    @db.VarChar(255)
  // Owning user; no onDelete action is specified here.
  user      User      @relation(fields: [userId], references: [id])
  createdAt DateTime? @default(now())
  // @updatedAt: Prisma stores the time of the last update automatically.
  updatedAt DateTime? @updatedAt
}

// Tool: a named, user-owned tool definition with an optional ToolType and JSON metadata.
model Tool {
  // String primary key generated with cuid(), stored as VarChar(255).
  id          String      @id @default(cuid()) @db.VarChar(255)
  name        String
  description String?     @db.Text()
  // Optional and has no default, so a Tool may be stored without a type.
  type        ToolType?
  // Optional JSON; its shape is not defined by this schema.
  metadata    Json?
  userId      String      @db.VarChar(255)
  // Owning user; no onDelete action is specified here.
  user        User        @relation(fields: [userId], references: [id])
  createdAt   DateTime?   @default(now())
  // @updatedAt: Prisma stores the time of the last update automatically.
  updatedAt   DateTime?   @updatedAt
  // Back-relation: Agents whose toolId references this tool directly.
  Agent       Agent[]
  // Back-relation: AgentTool link rows referencing this tool.
  AgentTool   AgentTool[]
}

// AgentTool: link row between an Agent (required) and a Tool (optional).
model AgentTool {
  // String primary key generated with cuid(), stored as VarChar(255).
  id        String    @id @default(cuid()) @db.VarChar(255)
  // Optional side of the link (nullable foreign key); no onDelete action specified.
  tool      Tool?     @relation(fields: [toolId], references: [id])
  toolId    String?   @db.VarChar(255)
  agentId   String    @db.VarChar(255)
  // Required side; onDelete: Cascade deletes this link when the Agent is deleted.
  agent     Agent     @relation(fields: [agentId], references: [id], onDelete: Cascade)
  createdAt DateTime? @default(now())
  // @updatedAt: Prisma stores the time of the last update automatically.
  updatedAt DateTime? @updatedAt
  // NOTE(review): presumably a soft-delete marker — confirm how queries treat it.
  deletedAt DateTime?
}

Next suggested step: decide what to do with the API, since some of this will overlap across three uses: data exploration (to be exposed via the UI), general data retrieval (so people can use it from JS), and finally Langchain agent execution via metadata.