otto8-ai / otto8

Open source AI Agent Platform
Apache License 2.0
9 stars 9 forks source link

knowledge - Loading failure relating to `failed to stat files in workspace ID` #540

Open sangee2004 opened 4 days ago

sangee2004 commented 4 days ago

Note: this error may not always be seen. Steps to reproduce the problem:

  1. Create an agent with knowledge from a OneDrive location - Corp Docs (600+ files)
  2. Have New Files Ingestion Policy set to Automatic

Ingestion reports failure for one of the files:

Screenshot 2024-11-11 at 2 32 30 PM

Agent - https://test.otto8.ai/admin/agents/a147997 Debug logs for the loading run -

{
  "frames": {
    "1731363839": {
      "chatResponseCached": false,
      "currentAgent": {

      },
      "displayText": "Running Knowledge Loading from /otto8-tools/./knowledge/load.gpt",
      "end": "0001-01-01T00:00:00Z",
      "id": "1731363839",
      "input": "{\"input\":\"/Corp Docs/Accounting/Accounts Payable/Pinnacle Branding/Invoice26648-1.pdf\",\"output\":\".conversion/Corp Docs/Accounting/Accounts Payable/Pinnacle Branding/Invoice26648-1.pdf.json\"}",
      "inputContext": null,
      "llmRequest": {
        "command": [
          "/bin/sh",
          "-c",
          "exec ${GPTSCRIPT_TOOL_DIR}/bin/gptscript-go-tool load --flows-file=blueprint:otto --flow=ottoload \"ws://${INPUT}\" \"ws://${OUTPUT}\""
        ],
        "input": "{\"input\":\"/Corp Docs/Accounting/Accounts Payable/Pinnacle Branding/Invoice26648-1.pdf\",\"output\":\".conversion/Corp Docs/Accounting/Accounts Payable/Pinnacle Branding/Invoice26648-1.pdf.json\"}"
      },
      "llmResponse": null,
      "output": [
        {
          "content": "warning: ignoring CMap range (50-50) that is outside of the codespace\nwarning: ignoring CMap range (54-54) that is outside of the codespace\nwarning: ignoring CMap range (52-52) that is outside of the codespace\nwarning: ignoring CMap range (56-56) that is outside of the codespace\nwarning: ignoring CMap range (45-45) that is outside of the codespace\nwarning: ignoring CMap range (49-49) that is outside of the codespace\nwarning: ignoring CMap range (32-32) that is outside of the codespace\nwarning: ignoring CMap range (48-48) that is outside of the codespace\nwarning: ignoring CMap range (53-53) that is outside of the codespace\nwarning: ignoring CMap range (47-47) that is outside of the codespace\nwarning: ignoring CMap range (55-55) that is outside of the codespace\nwarning: ignoring CMap range (57-57) that is outside of the codespace\nwarning: ignoring CMap range (51-51) that is outside of the codespace\nwarning: ignoring CMap range (36-36) that is outside of the codespace\nwarning: ignoring CMap range (46-46) that is outside of the codespace\nwarning: ignoring CMap range (58-58) that is outside of the codespace\nwarning: ignoring CMap range (44-44) that is outside of the codespace\nwarning: ignoring CMap range (40-40) that is outside of the codespace\nwarning: ignoring CMap range (41-41) that is outside of the codespace\nwarning: ignoring CMap range (78-78) that is outside of the codespace\nwarning: ignoring CMap range (98-98) that is outside of the codespace\nwarning: ignoring CMap range (101-101) that is outside of the codespace\nwarning: ignoring CMap range (68-68) that is outside of the codespace\nwarning: ignoring CMap range (97-97) that is outside of the codespace\nwarning: ignoring CMap range (73-73) that is outside of the codespace\nwarning: ignoring CMap range (86-86) that is outside of the codespace\nwarning: ignoring CMap range (79-79) that is outside of the codespace\nwarning: ignoring CMap range (67-67) that is outside of 
the codespace\nwarning: ignoring CMap range (69-69) that is outside of the codespace\nwarning: ignoring CMap range (82-82) that is outside of the codespace\nwarning: ignoring CMap range (32-32) that is outside of the codespace\nwarning: ignoring CMap range (84-84) that is outside of the codespace\nwarning: ignoring CMap range (83-83) that is outside of the codespace\nwarning: ignoring CMap range (72-72) that is outside of the codespace\nwarning: ignoring CMap range (80-80) that is outside of the codespace\nwarning: ignoring CMap range (70-70) that is outside of the codespace\nwarning: ignoring CMap range (66-66) that is outside of the codespace\nwarning: ignoring CMap range (100-100) that is outside of the codespace\nwarning: ignoring CMap range (46-46) that is outside of the codespace\nwarning: ignoring CMap range (99-99) that is outside of the codespace\nwarning: ignoring CMap range (85-85) that is outside of the codespace\nwarning: ignoring CMap range (81-81) that is outside of the codespace\nwarning: ignoring CMap range (38-38) that is outside of the codespace\nwarning: ignoring CMap range (45-45) that is outside of the codespace\nwarning: ignoring CMap range (65-65) that is outside of the codespace\nwarning: ignoring CMap range (76-76) that is outside of the codespace\n",
          "subCalls": null
        }
      ],
      "start": "2024-11-11T22:31:44.451298646Z",
      "tool": {
        "arguments": {
          "properties": {
            "Input": {
              "description": "Input File",
              "type": "string"
            },
            "Output": {
              "description": "Output File",
              "type": "string"
            },
            "know_load_metadata": {
              "description": "Comma-delimited key=value pairs to be added to the metadata of the loaded document.",
              "type": "string"
            }
          },
          "type": "object"
        },
        "description": "Load a document, convert it to Knowledge JSON Format and store it to a file.",
        "id": "/otto8-tools/./knowledge/load.gpt:Knowledge Loading",
        "instructions": "#!${GPTSCRIPT_TOOL_DIR}/bin/gptscript-go-tool load --flows-file=blueprint:otto --flow=ottoload \"ws://${INPUT}\" \"ws://${OUTPUT}\"",
        "internalPrompt": null,
        "localTools": {
          "knowledge loading": "/otto8-tools/./knowledge/load.gpt:Knowledge Loading"
        },
        "modelName": "gpt-4o",
        "name": "Knowledge Loading",
        "source": {
          "lineNo": 1,
          "location": "/otto8-tools/./knowledge/load.gpt"
        },
        "workingDir": "/otto8-tools/knowledge"
      },
      "toolResults": 0,
      "type": "callProgress",
      "usage": {

      }
    }
  },
  "spec": {
    "synchronous": true,
    "threadName": "t1-ks1fvwtj",
    "input": "{\"input\":\"/Corp Docs/Accounting/Accounts Payable/Pinnacle Branding/Invoice26648-1.pdf\",\"output\":\".conversion/Corp Docs/Accounting/Accounts Payable/Pinnacle Branding/Invoice26648-1.pdf.json\"}",
    "tool": "\"knowledge-load\""
  },
  "status": {
    "state": "error",
    "output": "",
    "endTime": "2024-11-11T22:31:44Z",
    "error": "run encountered an error: failed to read events: context canceled with error output: "
  }
}
sangee2004 commented 3 days ago

This issue was also seen when ingesting acorn.io - https://test.otto8.ai/admin/agents/a18bbqq for file - https://www.acorn.io/resources/learning-center

This is the error message:

failed to stat files in workspace ID s3://test-otto8-workspaces/39e75688-f636-416f-89d9-e7565106548f, error: not found: s3://test-otto8-workspaces/39e75688-f636-416f-89d9-e7565106548f/.conversion/www.acorn.io/resources/learning-center.md.json

Agent - https://test.otto8.ai/admin/agents/a18bbqq

iwilltry42 commented 2 days ago

Unfortunately I cannot find the run to get the full output. I could find runs related to the same file, but none with this error. This is the only occurrence where we stat files in the workspace in knowledge - it would return a differently formatted error though: https://github.com/otto8-ai/tools/blob/4aa873eeebfd67fc0d797b2b9ac65771ae17bd52/knowledge/pkg/client/standalone.go#L97

I assume this is coming from somewhere else, so I'll dig further.

Update: It's actually coming from otto8 core: https://github.com/otto8-ai/otto8/blob/9b8eb313224838fe03222c3378825f09db697fca/pkg/controller/handlers/knowledgefile/knowledgefile.go#L224

iwilltry42 commented 15 hours ago

Donnie just put in a change - please retest :)

sangee2004 commented 12 hours ago

Tested with version

"github.com/otto8-ai/tools": "c47df03a1857be27eb23512acbe48b29368ba555",
  "otto": "v0.0.0-dev+fdad35a6"

Able to reproduce the issue.

Ingested knowledge files from https://nginx.org/en/docs

Ingestion of a few files reported an error. Image

A few of the file ingestions failed with the error - failed to stat files in workspace ID s3://test-otto8-workspaces/c075c962-cfa0-4eb0-9ef2-f2af5ea829de, error: not found: s3://test-otto8-workspaces/c075c962-cfa0-4eb0-9ef2-f2af5ea829de/.conversion/nginx.org/en/docs/stream/ngx_stream_access_module.html.md.json

Agent that encountered this error - https://test.otto8.ai/admin/agents/a1x5b6w

Debug log for runs with this failure - https://test.otto8.ai/api/runs/r1zc7j4/debug https://test.otto8.ai/api/runs/r1fkz7n/debug https://test.otto8.ai/api/runs/r147qdz/debug https://test.otto8.ai/api/runs/r1fjv2q/debug

Debug logs for one of them:


{
  "frames": {
    "1731701894": {
      "chatResponseCached": false,
      "currentAgent": {

      },
      "displayText": "Running Knowledge Loading from /otto8-tools/./knowledge/load.gpt",
      "end": "0001-01-01T00:00:00Z",
      "id": "1731701894",
      "input": "{\"input\":\".conversion/nginx.org/en/docs/stream/ngx_stream_access_module.html.md\",\"output\":\".conversion/nginx.org/en/docs/stream/ngx_stream_access_module.html.md.json\"}",
      "inputContext": null,
      "llmRequest": {
        "command": [
          "/bin/sh",
          "-c",
          "exec ${GPTSCRIPT_TOOL_DIR}/bin/gptscript-go-tool load --flows-file=blueprint:otto --flow=ottoload \"ws://${INPUT}\" \"ws://${OUTPUT}\""
        ],
        "input": "{\"input\":\".conversion/nginx.org/en/docs/stream/ngx_stream_access_module.html.md\",\"output\":\".conversion/nginx.org/en/docs/stream/ngx_stream_access_module.html.md.json\"}"
      },
      "llmResponse": null,
      "output": null,
      "start": "2024-11-15T19:16:29.340607531Z",
      "tool": {
        "arguments": {
          "properties": {
            "Input": {
              "description": "Input File",
              "type": "string"
            },
            "Output": {
              "description": "Output File",
              "type": "string"
            },
            "know_load_metadata": {
              "description": "Comma-delimited key=value pairs to be added to the metadata of the loaded document.",
              "type": "string"
            }
          },
          "type": "object"
        },
        "description": "Load a document, convert it to Knowledge JSON Format and store it to a file.",
        "id": "/otto8-tools/./knowledge/load.gpt:Knowledge Loading",
        "instructions": "#!${GPTSCRIPT_TOOL_DIR}/bin/gptscript-go-tool load --flows-file=blueprint:otto --flow=ottoload \"ws://${INPUT}\" \"ws://${OUTPUT}\"",
        "internalPrompt": null,
        "localTools": {
          "knowledge loading": "/otto8-tools/./knowledge/load.gpt:Knowledge Loading"
        },
        "modelName": "gpt-4o",
        "name": "Knowledge Loading",
        "source": {
          "lineNo": 1,
          "location": "/otto8-tools/./knowledge/load.gpt"
        },
        "workingDir": "/otto8-tools/knowledge"
      },
      "toolResults": 0,
      "type": "callChat",
      "usage": {

      }
    }
  },
  "spec": {
    "synchronous": true,
    "threadName": "t1-ks1fbwp4",
    "input": "{\"input\":\".conversion/nginx.org/en/docs/stream/ngx_stream_access_module.html.md\",\"output\":\".conversion/nginx.org/en/docs/stream/ngx_stream_access_module.html.md.json\"}",
    "tool": "\"knowledge-load\""
  },
  "status": {
    "state": "error",
    "output": "",
    "endTime": "2024-11-15T19:16:29Z",
    "error": "run encountered an error: failed to read events: context canceled with error output: "
  }
}