hhyo / Archery

SQL 审核查询平台
https://archerydms.com
Apache License 2.0
6.01k stars 1.68k forks source link

清理django_q_task表以后工单无法成功执行 #2023

Closed tanglu86 closed 1 year ago

tanglu86 commented 1 year ago

重现步骤

我们公司部署的Archery已经运行了2年多,django_q_task表数据量过大,目前有50多G,在测试环境对该表truncate后发现所有工单都无法执行成功,均为排队中的状态,尝试过以下排错无果: 1、archery.log、qcluster.log、supervisord.log 三个日志没有报错信息 2、goInception中通过inception show processlist看不到工单 3、Failed task里没有失败的任务 4、python manage.py qmonitor是正常RUNNING状态

预期外的结果

1 2 3

日志文本

No response

版本

1.8.4

部署方式

手工部署

是否还有其他可以辅助定位问题的信息?比如数据库版本等

MySQL: 5.7.30 Browsers: Chrome goInception: v1.3.0-42

nick2wang commented 1 year ago

清空django_q_task并不会影响后续任务的执行,看下debug信息里的qcluser状态 http://your_archery_host/api/debug

{
    "archery": {
        "version": "1.9.1"
    },
    "django_q": {
        "version": "1.3.9",
        "conf": {
            "name": "archery",
            "workers": 4,
            "recycle": 500,
            "timeout": 60,
            "compress": true,
            "cpu_affinity": 1,
            "save_limit": 0,
            "queue_limit": 50,
            "label": "Django Q",
            "django_redis": "default",
            "sync": false
        },
        "q_cluster_stats": [
            {
                "host": "test",
                "cluster_id": "xxx",
                "state": "Idle",
                "pool": 2,
                "tq": 0,
                "rq": 0,
                "rc": 0,
                "up": "5:17:19"
            }
        ],
        "q_broker_stats": {
            "info": "Redis 5.0.13",
            "Queued": 0,
            "Success": 274,
            "Failures": 1
        }
    }
}
tanglu86 commented 1 year ago

q_cluster_stats

{ "archery": { "version": "1.8.4" }, "django_q": { "version": "1.3.4", "conf": { "name": "archery", "workers": 4, "recycle": 30, "timeout": 600, "compress": true, "cpu_affinity": 1, "save_limit": 0, "queue_limit": 20, "label": "Django Q", "django_redis": "default", "sync": false }, "q_cluster_stats": [{ "host": "archery-desensitization-test-ns-172-20-0-236", "cluster_id": "88d88cca-6140-4299-9b1d-fd2b5877d881", "state": "Working", "pool": 4, "tq": 20, "rq": 68, "rc": 19, "up": "1:03:50" }], "q_broker_stats": { "info": "Redis 3.2.9", "Queued": 1019, "Success": 1107, "Failures": 1 } }, "inception": { "enable_inception": null, "inception_info": "\u83b7\u53d6Inception\u4fe1\u606f\u62a5\u9519:connect() argument 1 must be str, not None", "goinception_info": { "version": "v1.3.0-42-g169e116", "max_allowed_packet": "4194304", "lang": "zh_cn", "osc_on": "false", "osc_bin_dir": "/usr/local/bin", "ghost_on": "false" }, "backup_info": "\u65e0\u6cd5\u8fde\u63a5Inception\u5907\u4efd\u5e93\n(1045, \"Access denied for user 'it'@'archery-desensitization-test-ns-172-20-0-236' (using password: YES)\")" }, "runtime_info": { "python_version": "3.9.10", "mysql_info": { "mysql_server_info": "5.7.30-log", "timezone_name": "Asia/Shanghai" }, "redis_info": { "redis_version": "3.2.9", "redis_mode": "standalone", "role": "master", "maxmemory_human": "0B", "used_memory_human": "315.34M" }, "sys_argv": ["/root/venv4archery184/bin/gunicorn", "-w", "2", "-b", "127.0.0.1:8000", "--timeout", "600", "archery.wsgi:application"], "platform": ["Linux", "archery-desensitization-test-ns-172-20-0-236", "4.4.180-1.el7.elrepo.x86_64", "#1 SMP Thu May 16 17:40:04 EDT 2019", "x86_64", "x86_64"] }, "sys_config": { "admin_query_limit": "1000", "api_user_whitelist": "", "archery_base_url": "", "auto_review": "", "auto_review_db_type": "", "auto_review_max_update_rows": "", "auto_review_regex": "", "auto_review_tag": "", "auto_review_wrong": "", "binlog2sql": "", "critical_ddl_regex": "", "data_masking": true, "ddl_notify_auth_group": "", "ddl_threshold": "", "default_auth_group": "", "default_resource_group": "\u6d4b\u8bd5", "ding": "", "ding_agent_id": "", "ding_app_key": "", "ding_app_secret": "**", "ding_archery_username": "", "ding_dept_ids": "", "ding_to_person": "", "disable_star": "", "dml_threshold": "100", "enable_backup_switch": "", "feishu": "", "feishu_appid": "", "feishu_app_secret": "**", "feishu_webhook": "", "go_inception_host": "172.20.0.236", "go_inception_port": "4000", "inception_remote_backup_host": "172.20.0.236", "inception_remote_backup_password": "**", "inception_remote_backup_port": "3306", "inception_remote_backup_user": "it", "index_path_url": "", "lock_cnt_threshold": "", "lock_time_threshold": "", "mail": "", "mail_smtp_password": "**", "mail_smtp_port": "", "mail_smtp_server": "", "mail_smtp_user": "", "mail_ssl": "", "manual": "", "max_execution_time": "", "my2sql": "", "notify_phase_control": "Apply,Pass,Execute,Cancel", "query_check": true, "qywx_webhook": "", "sign_up_enabled": true, "soar": "", "soar_test_dsn": "", "sqladvisor": "", "watermark_enabled": "", "wx": "", "wx_agent_id": "", "wx_app_secret": "", "wx_corpid": "" }, "packages": ["aliyun-python-sdk-core-v3==2.13.33", "aliyun-python-sdk-rds==2.1.1", "arrow==1.2.3", "asgiref==3.6.0", "async-timeout==4.0.2", "attrs==22.2.0", "bcrypt==4.0.1", "blessed==1.19.1", "certifi==2022.12.7", "cffi==1.15.1", "chardet==3.0.4", "click==7.1.2", "clickhouse-driver==0.2.3", "cryptography==39.0.0", "cx-oracle==7.3.0", "django-auth-ldap==2.2.0", "django-filter==21.1", "django-mirage-field==1.1.6", "django-picklefield==3.0.1", "django-q==1.3.4", "django-redis==4.12.1", "django==3.1.14", "djangorestframework-simplejwt==4.3.0", "djangorestframework==3.12.4", "drf-spectacular==0.22.0", "future==0.18.2", "gunicorn==20.0.4", "h11==0.14.0", "httptools==0.1.1", "idna==2.10", "inflection==0.5.1", "jinja2==3.1.2", "jmespath==0.10.0", "jsonschema==4.17.3", "markupsafe==2.1.1", "mybatis-mapper2sql==0.1.9", "mysql-replication==0.22", "mysqlclient==2.0.1", "numpy==1.24.1", "pandas==1.1.5", "paramiko==2.12.0", "parsedatetime==2.4", "phoenixdb==0.7", "pillow==8.4.0", "pip==22.3.1", "prettytable==3.5.0", "protobuf==4.21.12", "psycopg2-binary==2.8.6", "pyasn1-modules==0.2.8", "pyasn1==0.4.8", "pycparser==2.21", "pycryptodome==3.10.1", "pyecharts==1.7.1", "pyjwt==1.7.1", "pymongo==3.11.0", "pymysql==0.9.3", "pynacl==1.5.0", "pyodbc==4.0.30", "pyodps==0.10.7.1", "pyotp==2.6.0", "pyrsistent==0.19.3", "python-dateutil==2.8.1", "python-ldap==3.4.3", "pytz-deprecation-shim==0.1.0.post0", "pytz==2022.7", "pyyaml==6.0", "qrcode==7.3.1", "redis==4.4.0", "requests==2.24.0", "schema-object==0.5.11", "schema-sync==0.9.7", "setuptools==65.6.3", "simplejson==3.17.2", "six==1.16.0", "sqlparse==0.4.3", "sshtunnel==0.1.5", "supervisor==4.1.0", "tqdm==4.64.1", "tzdata==2022.7", "tzlocal==4.2", "uritemplate==4.1.1", "urllib3==1.25.11", "uvicorn==0.12.2", "uvloop==0.14.0", "wcwidth==0.2.5", "wheel==0.38.4"] }

hhyo commented 1 year ago

Queued这么多,都是排队状态,worker消费不过来,可以增加worker数,真不行可以清一下队列重新执行

tanglu86 commented 1 year ago

调查清楚了,我们自己增加的自动脱敏任务创建了太多任务导致的,谢谢作者