bytedance / bhook

:fire: ByteHook is an Android PLT hook library which supports armeabi-v7a, arm64-v8a, x86 and x86_64.
https://github.com/bytedance/bhook/tree/main/doc#readme
MIT License
2.05k stars 315 forks source link

native crash at android 13 #71

Closed hongyang-hua-20230331 closed 1 year ago

hongyang-hua-20230331 commented 1 year ago

bytehook Version

1.0.6

Android OS Version

13

Android ABIs

armeabi-v7a, arm64-v8a

Device Manufacturers and Models

samsung b0q/samsung r8q/ samsung a32

Describe the Bug

我们使用bhook来hook lock的write方法从而监听lock信息,使用如下:

void LockMonitor::LogLockTrace(const void* buf){
    const char* msg = reinterpret_cast<const char*>(buf);

    // B|23086|monitor contention with owner testlock (23548) at void java.lang.Thread.sleep(java.lang.Object, long, int)(Thread.java:-2) waiters=0 blocking from void com.example.nativehookdemo.MainActivity.testLock2$lambda-3(com.example.nativehookdemo.MainActivity)(MainActivity.kt:38)-23086-23086
    // 1 thread name -> owner
    // 2 thread id -> owner name
    // 3 invoke name -> owner method
    // 4 wait thread id ->
    // 5 wait thread name
    // 6 wait thread invoke name

    // duration =
    // start: key = threadname-threadid
    // end: key = threadname-threadid

    switch (msg[0]) {
        // begin
        case 'B': {
            std::string msgStr = msg;
            if (isLockMsg(msgStr)) {
                int pid = getpid();
                int tid = gettid();
                // filter非主线程
                if (pid != tid) {
//                    LOGD("not main thread lock start, process_id: %d, thread_id: %d", pid, tid);
                    return;
                }
                double time = std::chrono::duration_cast<std::chrono::milliseconds>(
                        std::chrono::steady_clock::now().time_since_epoch()).count();
//                LOGD("startTime ms:%s", std::to_string(time).c_str());
                std::string key = std::to_string(pid) + "-" + std::to_string(tid);
                std::string currentThreadName = GetThreadName(tid);

                LockContext *lockContext = new LockContext();
                lockContext->set_wait_thread_id(tid);
                lockContext->set_wait_thread_name(currentThreadName);
                lockContext->set_begin_time_ms(time);
                parseMsgToLockContext(msg, lockContext);
                map_lock_context_[key] = lockContext;
                Notify_java_delay_sample_main_thread_stack(lockContext->get_owner_thread_name(),
                                                           lockContext->get_owner_thread_id());
            }
            break;
        }
        case 'E': {
            int pid = getpid();
            int tid = gettid();
            // filter非主线程
            if (pid != tid) {
                //LOGD("not main thread lock end, process_id: %d, thread_id: %d", pid, tid);
                return;
            }
            double time = std::chrono::duration_cast<std::chrono::milliseconds>(
                    std::chrono::steady_clock::now().time_since_epoch()).count();
            std::string key = std::to_string(pid) + "-" + std::to_string(tid);
            std::map<std::string, LockContext*>::iterator lock_iter;
            lock_iter = map_lock_context_.find(key);
            if (lock_iter != map_lock_context_.end()) {
//                LOGD("endTime ms:%s", std::to_string(time).c_str());
                LockContext* preLockContext = lock_iter->second;
                preLockContext->set_end_time_ms(time);
                // 一个begin->end周期结束 删除掉map中的数据
                map_lock_context_.erase(lock_iter);
                Report(*preLockContext);
                delete preLockContext;
            }
            break;
        }
    }

}

/**
 * Proxy installed in place of write() by LockMonitor::DoHook().
 *
 * When LockMonitor::isATraceFd() accepts the descriptor, the buffer is first handed to
 * LockMonitor::LogLockTrace(); the call is then always forwarded to the previous
 * write() implementation.
 *
 * @param fd    File descriptor being written to.
 * @param buf   Bytes being written.
 * @param count Number of bytes in buf.
 * @return The value returned by the previous write(). The type is ssize_t, matching
 *         write(): it is signed so the -1 error result is not turned into a huge
 *         unsigned byte count.
 */
ssize_t WriteProxy(int fd, const void *buf, size_t count)
{
    // Performs bytehook's stack cleanup when this scope exits (must not be omitted).
    BYTEHOOK_STACK_SCOPE();

    if (LockMonitor::Get().isATraceFd(fd)) {
        LockMonitor::Get().LogLockTrace(buf);
    }
    return BYTEHOOK_CALL_PREV(WriteProxy, fd, buf, count);
}

/**
 * Installs WriteProxy as the hook for "write" via bytehook_hook_all() and stores the
 * returned stub in stub_.
 *
 * @return OK when a stub was obtained (hooked_ is then set to true); HOOK_FAILED when
 *         the call threw or returned a null stub.
 */
int LockMonitor::LockMonitor::DoHook()
{
    LOGD("hook start");
    try {
        stub_ = bytehook_hook_all(nullptr,
                                  "write",
                                  reinterpret_cast<void *>(WriteProxy),
                                  nullptr,
                                  nullptr);
    } catch (...) {
        return HOOK_FAILED;
    }

    // NOTE(review): bytehook_hook_all is a C API, so failure is presumably signalled by
    // a null stub rather than an exception — confirm against the ByteHook API manual.
    // Without this check a failed hook would still be logged and flagged as a success.
    if (stub_ == nullptr) {
        return HOOK_FAILED;
    }

    LOGD("hook success");
    hooked_ = true;
    return OK;
}

线上出现了一些 Android 13 的 native 崩溃,崩溃堆栈信息如下:

Thread Check failed: checkpoint != nullptr Checkpoint flag set without pending checkpoint

backtrace:
  #00  pc 0x0000000000051ce8  /apex/com.android.runtime/lib64/bionic/libc.so (abort+164)
  #01  pc 0x00000000006f9f28  /apex/com.android.art/lib64/libart.so (art::Runtime::Abort(char const*)+472)
  #02  pc 0x0000000000016ea8  /apex/com.android.art/lib64/libbase.so (android::base::SetAborter(std::__1::function<void (char const*)>&&)::$_3::__invoke(char const*)+80)
  #03  pc 0x0000000000016450  /apex/com.android.art/lib64/libbase.so (android::base::LogMessage::~LogMessage()+352)
  #04  pc 0x0000000000426030  /apex/com.android.art/lib64/libart.so (art::Thread::RunCheckpointFunction()+364)
  #05  pc 0x00000000003bf274  /apex/com.android.art/lib64/libart.so (void art::Monitor::Lock<(art::LockReason)1>(art::Thread*)+6852)
  #06  pc 0x00000000003bd3a4  /apex/com.android.art/lib64/libart.so (artLockObjectFromCode+352)
  #07  pc 0x0000000000458024  /apex/com.android.art/lib64/libart.so (art_quick_lock_object_no_inline+52)
  #08  pc 0x0000000000ad4824  /data/misc/apexdata/com.android.art/dalvik-cache/arm64/boot.oat (android.os.MessageQueue.next+324)
  #09  pc 0x0000000000ad0be8  /data/misc/apexdata/com.android.art/dalvik-cache/arm64/boot.oat (android.os.Looper.loopOnce+104)
  #10  pc 0x0000000000ad0a4c  /data/misc/apexdata/com.android.art/dalvik-cache/arm64/boot.oat (android.os.Looper.loop+1148)
  #11  pc 0x0000000001a4c3bc  /data/app/~~o4L7s6XTZ3Vo1Db_2KCBGw==/com.xxx.xxx-EzLIXXlEef1QEZ1c_I2GmA==/oat/arm64/base.odex (com.infra.crashhunter.CrashHunter$init$$inlined$let$lambda$1.run+60)
  #12  pc 0x0000000000acced4  /data/misc/apexdata/com.android.art/dalvik-cache/arm64/boot.oat (android.os.Handler.dispatchMessage+84)
  #13  pc 0x0000000000ad0f88  /data/misc/apexdata/com.android.art/dalvik-cache/arm64/boot.oat (android.os.Looper.loopOnce+1032)
  #14  pc 0x0000000000ad0a4c  /data/misc/apexdata/com.android.art/dalvik-cache/arm64/boot.oat (android.os.Looper.loop+1148)
  #15  pc 0x00000000007d9b18  /data/misc/apexdata/com.android.art/dalvik-cache/arm64/boot.oat (android.app.ActivityThread.main+1480)
  #16  pc 0x0000000000457e00  /apex/com.android.art/lib64/libart.so (art_quick_invoke_static_stub+576)
  #17  pc 0x000000000048c038  /apex/com.android.art/lib64/libart.so (_jobject* art::InvokeMethod<(art::PointerSize)8>(art::ScopedObjectAccessAlreadyRunnable const&, _jobject*, _jobject*, _jobject*, unsigned long)+1560)
  #18  pc 0x000000000048b9f8  /apex/com.android.art/lib64/libart.so (art::Method_invoke(_JNIEnv*, _jobject*, _jobject*, _jobjectArray*) (.__uniq.165753521025965369065708152063621506277)+48)
  #19  pc 0x00000000002f2148  /data/misc/apexdata/com.android.art/dalvik-cache/arm64/boot.oat (art_jni_trampoline+120)
  #20  pc 0x0000000000a38560  /data/misc/apexdata/com.android.art/dalvik-cache/arm64/boot.oat (com.android.internal.os.RuntimeInit$MethodAndArgsCaller.run+144)
  #21  pc 0x0000000000a43404  /data/misc/apexdata/com.android.art/dalvik-cache/arm64/boot.oat (com.android.internal.os.ZygoteInit.main+3604)
  #22  pc 0x0000000000457e00  /apex/com.android.art/lib64/libart.so (art_quick_invoke_static_stub+576)
  #23  pc 0x000000000058bbc4  /apex/com.android.art/lib64/libart.so (art::JValue art::InvokeWithVarArgs<_jmethodID*>(art::ScopedObjectAccessAlreadyRunnable const&, _jobject*, _jmethodID*, std::__va_list)+912)
  #24  pc 0x0000000000609af8  /apex/com.android.art/lib64/libart.so (art::JNI<true>::CallStaticVoidMethodV(_JNIEnv*, _jclass*, _jmethodID*, std::__va_list)+172)
  #25  pc 0x00000000000c0ad0  /system/lib64/libandroid_runtime.so (_JNIEnv::CallStaticVoidMethod(_jclass*, _jmethodID*, ...)+120)
  #26  pc 0x00000000000ccdf8  /system/lib64/libandroid_runtime.so (android::AndroidRuntime::start(char const*, android::Vector<android::String8> const&, bool)+852)
  #27  pc 0x0000000000002568  /system/bin/app_process64 (main+1300)
  #28  pc 0x000000000004a288  /apex/com.android.runtime/lib64/bionic/libc.so (__libc_init+96)
caikelun commented 1 year ago
  1. 从崩溃 backtrace 看不出和 bytehook 有直接的关系,只能从侧面验证排查。
  2. write() 的返回值类型是 ssize_t,不是 size_t,这个首先需要改掉。
  3. 如果线上还有问题,可以把 LogLockTrace 函数当中的 JNI 调用先去掉,甚至把 C++ std 也都去掉,先验证稳定性。因为 hook 后插入的 proxy 逻辑对于原逻辑是个“意外”,比如 JNI 系统流程中如果有 write,这时被你 hook 了,在 proxy 中又调用了 JNI 函数,这种行为可能破坏了原逻辑中的一些逻辑假设。
hongyang-hua-20230331 commented 1 year ago

感谢回复🙏,我试试看