quirrel-dev / quirrel

The Task Queueing Solution for Serverless.
https://quirrel.dev
MIT License
885 stars 67 forks source link

[Sveltekit] Quirrel job fails on production #1154

Closed lnfel closed 1 year ago

lnfel commented 1 year ago

Bug Report

Quirrel works well on development environment but fails on production (job never fires). The quirrel instance is hosted on railway following the guide on https://docs.quirrel.dev/deployment/railway and checking the deploy logs gave me the following:

{"level":10,"time":1688093351978,"pid":8,"hostname":"railway","module":"owl","msg":"Checking for jobs"}
{"level":10,"time":1688093351978,"pid":8,"hostname":"railway","module":"owl","msg":"Peeking into queue"}
{"level":10,"time":1688093351979,"pid":8,"hostname":"railway","module":"owl","result":["empty"],"msg":"Checking for jobs finished"}
{"level":30,"time":1688093352793,"pid":8,"hostname":"railway","reqId":"req-4f","req":{"method":"POST","url":"/queues/https%3A%2F%2Fmy-domain%2Fquirrel","hostname":"quirrel-production-56fc.up.railway.app","remoteAddress":"192.168.0.2","remotePort":42976},"msg":"incoming request"}
{"level":20,"time":1688093352795,"pid":8,"hostname":"railway","module":"owl","job":{"queue":"aerial_token;https%3A%2F%2Fmy-domain%2Fquirrel","id":"f66096fc-5c70-44eb-a8cf-3f8dbc97d735","payload":"cedc:encoded-data"},"msg":"Producer: Enqueueing"}
{"level":20,"time":1688093352796,"pid":8,"hostname":"railway","module":"owl","job":{"queue":"aerial_token;https%3A%2F%2Fmy-domain%2Fquirrel","id":"f66096fc-5c70-44eb-a8cf-3f8dbc97d735","payload":"cedc:encoded-data","runAt":"2023-06-30T02:49:12.795Z"},"msg":"Producer: Enqueued"}
{"level":30,"time":1688093352796,"pid":8,"hostname":"railway","job":{"id":"f66096fc-5c70-44eb-a8cf-3f8dbc97d735","endpoint":"https://my-domain/quirrel","body":"cedc:encoded-data","runAt":"2023-06-30T02:49:12.795Z","exclusive":false,"retry";:[],"count":1,"tokenId":"aerial_token"},"msg":"Created job."}
{"level":30,"time":1688093352797,"pid":8,"hostname":"railway","reqId":"req-4f","res":{"statusCode":201},"responseTime":3.6115440130233765,"msg":"request completed"}
{"level":10,"time":1688093352799,"pid":8,"hostname":"railway","module":"owl","result":["success",["aerial_token;https%3A%2F%2Fmy-domain%2Fquirrel","f66096fc-5c70-44eb-a8cf-3f8dbc97d735","cedc:encoded-data",1688093352795,"","","1","","false","[]"]],"msg":"Checking for jobs finished"}
{"level":10,"time":1688093352799,"pid":8,"hostname":"railway","module":"owl","job":["aerial_token;https%3A%2F%2Fmy-domain%2Fquirrel","f66096fc-5c70-44eb-a8cf-3f8dbc97d735","cedc:encoded-data",1688093352795,"","","1","","false","[]"],"msg":"Distributor: Starting work on job"}
{"level":10,"time":1688093352799,"pid":8,"hostname":"railway","module":"owl","job":{"queue":"aerial_token;https:%2F%2Fmy-domain%2Fquirrel","id":"f66096fc-5c70-44eb-a8cf-3f8dbc97d735","payload":"cedc:encoded-data","runAt":"2023-06-30T02:49:12.795Z","count":1,"exclusive":false,"retry":[]},"ackDescriptor":{"jobId":"f66096fc-5c70-44eb-a8cf-3f8dbc97d735","queueId":"aerial_token;https:%2F%2Fmy-domain%2Fquirrel"},"msg":"Worker: Starting execution"}
{"level":30,"time":1688093352799,"pid":8,"hostname":"railway","correlationId":"af44bda6-a4c8-4033-b837-057bbf29db22","job":{"tokenId":"aerial_token","endpoint":"https://my-domain/quirrel","body":"cedc:encoded-data","id":"f66096fc-5c70-44eb-a8cf-3f8dbc97d735";},"msg":"Started execution of job."}
{"level":10,"time":1688093353251,"pid":8,"hostname":"railway","module":"owl","job":{"queue":"aerial_token;https:%2F%2Fmy-domain%2Fquirrel","id":"f66096fc-5c70-44eb-a8cf-3f8dbc97d735","payload":"cedc:encoded-data","runAt":"2023-06-30T02:49:12.795Z","count":1,"exclusive":false,"retry":[]},"descriptor":{"jobId":"f66096fc-5c70-44eb-a8cf-3f8dbc97d735","queueId":"aerial_token;https:%2F%2Fmy-domain%2Fquirrel"},"options":{"dontReschedule":false},"msg":"Acknowledger: Starting to report failure"}
{"level":10,"time":1688093353252,"pid":8,"hostname":"railway","module":"owl","descriptor":{"jobId":"f66096fc-5c70-44eb-a8cf-3f8dbc97d735","queueId":"aerial_token;https:%2F%2Fmy-domain%2Fquirrel"},"msg":"Acknowledger: Job will be reported as failure."}
{"level":10,"time":1688093353252,"pid":8,"hostname":"railway","module":"owl","descriptor":{"jobId":"f66096fc-5c70-44eb-a8cf-3f8dbc97d735","queueId":"aerial_token;https:%2F%2Fmy-domain%2Fquirrel"},"msg":"Acknowledger: Job isn't retryable, so it'll be accounted as an error."}
{"level":10,"time":1688093353256,"pid":8,"hostname":"railway","module":"owl","job":{"queue":"aerial_token;https:%2F%2Fmy-domain%2Fquirrel","id":"f66096fc-5c70-44eb-a8cf-3f8dbc97d735","payload":"cedc:encoded-data","runAt":"2023-06-30T02:49:12.795Z","count":1,"exclusive":false,"retry":[]},"ackDescriptor":{"jobId":"f66096fc-5c70-44eb-a8cf-3f8dbc97d735","queueId":"aerial_token;https:%2F%2Fmy-domain%2Fquirrel"},"msg":"Worker: Finished execution"}
{"level":10,"time":1688093353256,"pid":8,"hostname":"railway","module":"owl","job":["aerial_token;https%3A%2F%2Fmy-domain%2Fquirrel","f66096fc-5c70-44eb-a8cf-3f8dbc97d735","cedc:encoded-data",1688093352795,"","","1","","false","[]"],"msg":"Distributor: Finished work on job"}
{"level":10,"time":1688093353256,"pid":8,"hostname":"railway","module":"owl","msg":"Checking for jobs"}
{"level":10,"time":1688093353256,"pid":8,"hostname":"railway","module":"owl","msg":"Peeking into queue"}
{"level":10,"time":1688093353257,"pid":8,"hostname":"railway","module":"owl","result":["empty"],"msg":"Checking for jobs finished"}
{"level":10,"time":1688093353905,"pid":8,"hostname":"railway","module":"owl","msg":"Stale-Checker: Starting."}
{"level":10,"time":1688093353905,"pid":8,"hostname":"railway","module":"owl","msg":"Stale-Checker: No stale jobs found."}

I am not really sure what the log is trying to say but it seemed quirrel does received the request but was not able to run the job?

This is how I use quirrel:

// First export Quirrel as route endpoint in /quirrel +server.js file

/**
 * SvelteKit build crash with Quirrel import
 * https://github.com/quirrel-dev/quirrel/issues/1111#issuecomment-1365873921
 */
import { Queue } from 'quirrel/sveltekit.cjs'

const queue = Queue(
  'quirrel',
  async (job, meta) => {
    console.log('Running quirrel job') // This has never been run on production

    // additional work here that creates new data to database
  }
)

// important part
export const POST = queue

/**
 * I've seen this part on https://docs.quirrel.dev/api/sveltekit but sveltekit has long since disallows doing this
 * so I had it commented out
 */
// export default queue
// Then sveltekit form action in /queue endpoint inside +page.server.js file to enqueue job

import prisma from '$lib/prisma.js'
import { POST as quirrel } from '../quirrel/+server.js'

export const actions = {
  default: async ({ request, locals }) => {
    const model = await prisma.table.create({ data: <some-data-here> })

    await quirrel.enqueue({ modelId: model.id, locals })
  }
}

The code above is just a simple job that adds new data to database, not sure why it does not work in production.

Quirrel env variables are set like this inside the project's .env file and also set this as shared variables on the railway quirrel instance:

QUIRREL_TOKEN=aeda6e05...
QUIRREL_BASE_URL=https://my-domain.com
QUIRREL_ENCRYPTION_SECRET=2ec99...
QUIRREL_API_URL=https://quirrel-production-56fc.up.railway.app

Environment

Additional notes: Also tried to simulate production environment locally using localhost:9181 as quirrel instance and app at localhost:3000, same thing also happens (job never fires) but I can't find logs of quirrel so it was really hard to debug locally.

lnfel commented 1 year ago

So I managed to find the piece of code related to error logging, it was in owl repo: https://github.com/quirrel-dev/owl/blob/f25b3952705c7e9bf0bd6b50f183b0cc366cb1c7/src/shared/acknowledger.ts#L90-L93

pipeline.publish(event, `${_queueId}:${_jobId}:${errorString}`);
pipeline.publish(_queueId, `${event}:${_jobId}:${errorString}`);
pipeline.publish(`${_queueId}:${_jobId}`, `${event}:${errorString}`);
pipeline.publish(`${_queueId}:${_jobId}:${event}`, errorString);

Now the question is where and when they are published? I don't see it on the logs or any relevant error message that can put me in the right direction on fixing the issue.

lnfel commented 1 year ago

Seems I figured out something from the logs, particularly the following:

{"level":30,"time":1688093352796,"pid":8,"hostname":"railway","job":{"id":"f66096fc-5c70-44eb-a8cf-3f8dbc97d735","endpoint":"https://my-domain/quirrel","body":"cedc:encoded-data","runAt":"2023-06-30T02:49:12.795Z","exclusive":false,"retry";:[],"count":1,"tokenId":"aerial_token"},"msg":"Created job."}
{"level":30,"time":1688093352799,"pid":8,"hostname":"railway","correlationId":"af44bda6-a4c8-4033-b837-057bbf29db22","job":{"tokenId":"aerial_token","endpoint":"https://my-domain/quirrel","body":"cedc:encoded-data","id":"f66096fc-5c70-44eb-a8cf-3f8dbc97d735";},"msg":"Started execution of job."}

Quirrel is trying to send a POST request to /quirrel endpoint on the sveltekit app. I tried to simulate what it does using curl:

curl -include -X POST "https://my-domain/quirrel" -H "Accept: application/json" -d "cedc:encoded-data"

Received the following message:

{"message":"Cross-site POST form submissions are forbidden"}

Sveltekit has enabled csrf protection by default. This is the main one of the reason why quirrel was not able to send post request to sveltekit app in production. In development environment, quirrel's host is set to be at localhost:9181, the sveltekit app is at localhost:5173 and this makes both quirrel and sveltekit app to be at same ORIGIN (localhost). On production the app would be something different i.e. https://mydomain.com (origin is mydomain) and quirrel hosted on railway would be https://quirrel-production-56fc.up.railway.app (origin is railway).

Disabling the CSRF protection first came into mind although it is not recommended. Disabled it so I could test things out, ran the build and started simulating production locally but it is still not working. It seems sending a POST request from seeing the logs but in fact it might not be able to send anything at all? Sending the POST through curl request does trigger the job tho.

The updated curl request is now like this:

# This will only work for development since quirrel requires `X-Quirrel-Signature` header in production
curl -include -X POST "http://localhost:3000/quirrel" -H "Accept: application/json" -H "Origin: http://localhost:3000" -d "cedc:encoded-data"

Now I need to figure out why quirrel is not sending the request...

Skn0tt commented 1 year ago

Hi @lnfel! Good work digging through the logs, it looks like you're on track to figure it out. As a workaround, disabling CSRF protection sounds good.

This is where Quirrel makes the HTTP request, and it doesn't specify an Origin header:

https://github.com/quirrel-dev/quirrel/blob/72ffb07d7d27dd28c9d338fa781f59fa0fdb1c33/src/api/worker/index.ts#L124-L129

So it looks like maybe we need to add it there. Do you want to open a PR for that? You should be able to get the value for Origin with something like new URL(endpoint).hostname.

lnfel commented 1 year ago

Really appreciate the input @Skn0tt. I'm working on it now. Added the whole endpoint as the Origin folowing the <scheme>://<hostname> syntax:

// src/api/worker/index.ts

let { tokenId, endpoint } = decodeQueueDescriptor(job.queue);

// omitted code...

const headers: Record<string, string> = {
    "Content-Type": "text/plain",
    "x-quirrel-meta": JSON.stringify({
      id: job.id,
      count: job.count,
      exclusive: job.exclusive,
      retry: job.retry,
      nextRepetition: ack.nextExecutionDate,
    }),
    "Origin": endpoint, // The change
};

Tried building Quirrel npm run build and started an instance on localhost:9181 npm run start. I also ran my my app on localhost:3000 with the following env:

QUIRREL_TOKEN=aeda6e05-...
QUIRREL_BASE_URL=http://localhost:3000
QUIRREL_API_URL=http://localhost:9181

Then fired off quirrel enqueue on my app. Same thing happened, it logged Created Job and Executing Job but the job was not done on the sveltekit app.

Any tips on how to test this locally?

From what I read on the code the fetch should work but for some reason it is not doing anything. I added some additional logging method on dx-logger.ts/logger.ts to figure what is going on:

// dx-logger.ts
startingExecution(...): () => void { ... }

test(response: any): void { console.log(chalk`Response: ${response}`) }
Skn0tt commented 1 year ago

I'm not sure what's the best way to debug this. Maybe you could console.log the parameters passed to fetch, and try replicating that with cURL to test?

You mentioned that Origin needs to be <scheme>://<hostname>. I'm pretty sure that endpoint also contains a path which you'd need to remove in before. Could you try new URL(endpoint).origin? That seemed to work fine for me locally:

> new URL("https://test.com/test").origin
"https://test.com"
lnfel commented 1 year ago

I have written a sveltekit handle to allow CORS from post requests for /api and /quirrel endpoints, this makes the changes made to Quirrel header ORIGIN unnecessary as the app will allow CORS POST request to pass:

// src/hooks.server.js

import { sequence } from '@sveltejs/kit/hooks'

async function svelteHandleCors({ event, resolve }) {
  // Apply CORS header for specified route endpoints
  console.log("API route: ", event.url.pathname.startsWith('/api'))
  console.log("Quirrel route: ", event.url.pathname.startsWith('/quirrel'))

  if (event.url.pathname.startsWith('/api') || event.url.pathname.startsWith('/quirrel')) {
    // Required for CORS to work
    console.log("Request method: ", event.request.method)
    if (event.request.method === 'OPTIONS') {
      return new Response(null, {
        headers: {
          'Access-Control-Allow-Methods': 'GET, POST, PUT, DELETE, PATCH, OPTIONS',
          'Access-Control-Allow-Origin': '*',
          'Access-Control-Allow-Headers': '*',
        }
      })
    }
  }

  const response = await resolve(event)
  if (event.url.pathname.startsWith('/api') || event.url.pathname.startsWith('/quirrel')) {
    response.headers.append('Access-Control-Allow-Origin', '*')
  }

  return response
}

export const handle = sequence(
  svelteHandleCors,
)

Did a lot of testing, so far here are the results:

✅ Sveltekit running on vite development server at port 5173:

👟Executing job queue: /quirrel id: 8433885c-e10a-44eb-8da6-02921bd8becb body: ...omitted

Token: null Endpoint: http://localhost:5173/quirrel Headers: {"Content-Type":"text/plain","x-quirrel-meta":"{\"id\":\"8433885c-e10a-44eb-8da6-02921bd8becb\",\"count\":1,\"exclusive\":false,\"retry\":[]}","Origin":"http://localhost:5173"} Response status: 200 Success: true

✔️ Successfully executed 8433885c-e10a-44eb-8da6-02921bd8becb.

### ❌  Sveltekit built with the following env config running using node at port 3000:
```env
QUIRREL_TOKEN=aeda6e05-...
QUIRREL_BASE_URL=http://localhost:3000
QUIRREL_API_URL=http://localhost:9181

Command used to run the node server after building the app:

ORIGIN=http://localhost:3000 BODY_SIZE_LIMIT=0 node build

How do we know the request was never sent? I've added a logger after Promise.all . response was never logged as well as any other logs inside the if else statement. The log looks like the following:

📝Created Job
  queue: /quirrel
  id: 8433885c-e10a-44eb-8da6-02921bd8becb
  body: ...omitted

👟Executing job
  queue: /quirrel
  id: 8433885c-e10a-44eb-8da6-02921bd8becb
  body: ...omitted

Token: null
Endpoint: http://localhost:5173/quirrel
Headers: {"Content-Type":"text/plain","x-quirrel-meta":"{\"id\":\"8433885c-e10a-44eb-8da6-02921bd8becb\",\"count\":1,\"exclusive\":false,\"retry\":[]}","Origin":"http://localhost:5173"}

❗Sveltekit built with the following env config running using vite preview at port 4173:

QUIRREL_TOKEN=aeda6e05-...
QUIRREL_BASE_URL=http://localhost:4173
QUIRREL_API_URL=http://localhost:9181

Command used to run vite preview after building the app:

npx vite preview --open

👟Executing job queue: /quirrel id: 8433885c-e10a-44eb-8da6-02921bd8becb body: ...omitted

Token: null Endpoint: http://localhost:5173/quirrel Headers: {"Content-Type":"text/plain","x-quirrel-meta":"{\"id\":\"8433885c-e10a-44eb-8da6-02921bd8becb\",\"count\":1,\"exclusive\":false,\"retry\":[]}","Origin":"http://localhost:4173"} Response status: 401 Success: false Response text: Signature missing



From what I see in the code, there is this `QUIRREL_SIGNATURE_PUBLIC_KEY` env variable that can be set. How do I get this? Is this also what is needed in production? Apparently there is no mention of this in the docs.

My conclusion is that Quirrel is working when vite server is used but it breaks when the app is run using node server. This is very tricky as there is no error or log that shows what is wrong. Running `vite preview` uses the files located at `.svelte-kit`, and I think the files in `build` folder is different hence I guess is the issue.
Skn0tt commented 1 year ago

QUIRREL_SIGNATURE_PUBLIC_KEY is unrelated to any of this, and it's not documented because it's only useful for large-scale, multi-tenant deployments.

I also don't think CORS plays a role here, since CORS is only enforced by the user-agent. Quirrel doesn't enforce CORS.

Out of interest, what happens when you disable CSRF protection in your Svelte App? Also, have you tried my recommendation from https://github.com/quirrel-dev/quirrel/issues/1154#issuecomment-1621331434? What was the result?

lnfel commented 1 year ago

I did tried it:

const headers: Record<string, string> = {
  "Content-Type": "text/plain",
  "x-quirrel-meta": JSON.stringify({
    id: job.id,
    count: job.count,
    exclusive: job.exclusive,
    retry: job.retry,
    nextRepetition: ack.nextExecutionDate,
  }),
  "Origin": new URL(endpoint).origin,
};

It shows on the logs too:

// src/api/worker/index.ts
logger?.test(`Headers: ${JSON.stringify(headers)}`);

// terminal
> Headers: {"Content-Type":"text/plain","x-quirrel-meta":"{\"id\":\"b24c5f93-a3c4-463c-8b44-3cc076b2631c\",\"count\":1,\"exclusive\":false,\"retry\":[]}","Origin":"http://localhost:3000"}

I also don't think CORS plays a role here, since CORS is only enforced by the user-agent. Quirrel doesn't enforce CORS.

Yes I think so too, that is why I handled the CORS on the sveltekit app instead. Disabling CORS on the app allows POST requests without same ORIGIN.

// svelte.config.js

import adapterNode from '@sveltejs/adapter-node'
import { vitePreprocess } from '@sveltejs/kit/vite'

const config = {
  kit: {
    adapter: adapterNode(),
    csrf: {
      checkOrigin: false // disable ORIGIN check to enable CORS
    }
  },
  preprocess: vitePreprocess()
}

export default config;

Here are the curl request I made:

# Server
# ORIGIN=http://localhost:3000 BODY_SIZE_LIMIT=0 node build
# Also note that `process.env.NODE_ENV` is undefined and the results are the same if I set it to development

curl -include -X POST "localhost:3000/quirrel" \
-H "Content-Type: text/plain" \
-H "x-quirrel-meta: {\"id\":\"4e6c1dcc-a13f-410b-8016-4d9fad6232c8\",\"count\":1,\"exclusive\":false,\"retry\":[]}" \
-d "{\"artifactCollectionId\":\"artc_WUkZO6\",\"locals\":{}}"

HTTP/1.1 200 OK
access-control-allow-origin: *
content-type: text/plain;charset=UTF-8
Date: Fri, 07 Jul 2023 02:13:59 GMT
Connection: keep-alive
Keep-Alive: timeout=5
Transfer-Encoding: chunked

OK

# Server
# NODE_ENV=production ORIGIN=http://localhost:3000 BODY_SIZE_LIMIT=0 node build
# X-Quirrel-Signature is required for production environment, Quirrel should have it sent but this is not the issue

curl -include -X POST "localhost:3000/quirrel" \
-H "Content-Type: text/plain" \
-H "x-quirrel-meta: {\"id\":\"4e6c1dcc-a13f-410b-8016-4d9fad6232c8\",\"count\":1,\"exclusive\":false,\"retry\":[]}" \
-d "{\"artifactCollectionId\":\"artc_0CFUsr\",\"locals\":{}}"

HTTP/1.1 401 Unauthorized
access-control-allow-origin: *
content-type: text/plain;charset=UTF-8
Date: Fri, 07 Jul 2023 02:19:33 GMT
Connection: keep-alive
Keep-Alive: timeout=5
Transfer-Encoding: chunked

Signature missing

Only one thing I can't get my head around with is when I run the app using node, the fetch code part is not fired by Quirrel. Talking about this part here:

https://github.com/quirrel-dev/quirrel/blob/154e1a131f648f0f3d3b2fd87f0254fc9766db2d/src/api/worker/index.ts#L123-L131

Issue might be unrelated to Quirrel but with how sveltekit builds the endpoint during build time (but curl works, so quirrel fetch should also work).

Skn0tt commented 1 year ago

I'm having trouble following. For me to help, please share a minimal reproduction repository, with steps on how to reproduce the behaviour you're seeing and what you'd expect to happen instead.

lnfel commented 1 year ago

@Skn0tt I have good news, while every effort I did on simulating production environment on my local machine was hopeless, I did managed to make the production instance of sveltekit app work with Quirrel hosted on railway. I should have tested this on production sooner 😅. I am sure setting origin check fixed the issue on production:

// svelte.config.js

import adapterNode from '@sveltejs/adapter-node'
import { vitePreprocess } from '@sveltejs/kit/vite'

const config = {
  kit: {
    adapter: adapterNode(),
    csrf: {
      checkOrigin: false // disable ORIGIN check to enable CORS
    }
  },
  preprocess: vitePreprocess()
}

export default config;

Thank you so much for guiding me!