SerenityOS / serenity

The Serenity Operating System 🐞
https://serenityos.org
BSD 2-Clause "Simplified" License
30.4k stars 3.18k forks source link

Kernel: purge() incorrectly clears dirty pages #15951

Open gunnarbeutner opened 1 year ago

gunnarbeutner commented 1 year ago

When clearing pages purge() drops dirty pages for inode VM objects because we're not tracking whether a page was dirtied. This can be observed by calling purge -c when the following test program is running:

#include <LibMain/Main.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

ErrorOr<int> serenity_main(Main::Arguments)
{
    int fd = open("/etc/fstab", O_RDONLY);
    char* p = (char*)mmap(nullptr, 4096, PROT_READ | PROT_WRITE, MAP_FILE | MAP_PRIVATE, fd, 0);
    close(fd);
    uintptr_t zero { 0 };
    memcpy(p, &zero, sizeof(zero));
    for (;;) {
        sleep(1);
        VERIFY(memcmp(p, &zero, sizeof(zero)) == 0);
    }
    return 0;
}
gunnarbeutner commented 1 year ago

I'm not particularly happy with this patch but here's a general idea of what's necessary:

commit e9e2fe4a952f29dfe4ba4ac3a3c4ca5381be1b91
Author: Gunnar Beutner <gbeutner@serenityos.org>
Date:   Sat Nov 5 12:46:30 2022 +0100

    WIP+Kernel: Track dirty pages for inode VM objects

diff --git a/Kernel/Memory/InodeVMObject.cpp b/Kernel/Memory/InodeVMObject.cpp
index d0fbd558b1..80edce8add 100644
--- a/Kernel/Memory/InodeVMObject.cpp
+++ b/Kernel/Memory/InodeVMObject.cpp
@@ -48,6 +48,16 @@ size_t InodeVMObject::amount_dirty() const
     return count * PAGE_SIZE;
 }

+bool InodeVMObject::is_page_dirty(size_t page_index) const
+{
+    return m_dirty_pages.get(page_index);
+}
+
+void InodeVMObject::mark_page_dirty(size_t page_index)
+{
+    m_dirty_pages.set(page_index, true);
+}
+
 int InodeVMObject::release_all_clean_pages()
 {
     SpinlockLocker locker(m_lock);
diff --git a/Kernel/Memory/InodeVMObject.h b/Kernel/Memory/InodeVMObject.h
index 2b439b32fd..d796333096 100644
--- a/Kernel/Memory/InodeVMObject.h
+++ b/Kernel/Memory/InodeVMObject.h
@@ -22,6 +22,9 @@ public:
     size_t amount_dirty() const;
     size_t amount_clean() const;

+    bool is_page_dirty(size_t) const;
+    void mark_page_dirty(size_t);
+
     int release_all_clean_pages();
     int try_release_clean_pages(int page_amount);

diff --git a/Kernel/Memory/Region.cpp b/Kernel/Memory/Region.cpp
index 4dd3e6c102..19c72fbb13 100644
--- a/Kernel/Memory/Region.cpp
+++ b/Kernel/Memory/Region.cpp
@@ -189,6 +189,8 @@ ErrorOr<NonnullOwnPtr<Region>> Region::try_create_user_accessible(VirtualRange c

 bool Region::should_cow(size_t page_index) const
 {
+    if (vmobject().is_inode())
+        return !static_cast<InodeVMObject const&>(vmobject()).is_page_dirty(first_page_index() + page_index);
     if (!vmobject().is_anonymous())
         return false;
     return static_cast<AnonymousVMObject const&>(vmobject()).should_cow(first_page_index() + page_index, m_shared);
@@ -387,10 +389,12 @@ PageFaultResponse Region::handle_fault(PageFault const& fault)
     VERIFY(fault.type() == PageFault::Type::ProtectionViolation);
     if (fault.access() == PageFault::Access::Write && is_writable() && should_cow(page_index_in_region)) {
         dbgln_if(PAGE_FAULT_DEBUG, "PV(cow) fault in Region({})[{}] at {}", this, page_index_in_region, fault.vaddr());
-        auto phys_page = physical_page(page_index_in_region);
-        if (phys_page->is_shared_zero_page() || phys_page->is_lazy_committed_page()) {
-            dbgln_if(PAGE_FAULT_DEBUG, "NP(zero) fault in Region({})[{}] at {}", this, page_index_in_region, fault.vaddr());
-            return handle_zero_fault(page_index_in_region, *phys_page);
+        if (!vmobject().is_inode()) {
+            auto phys_page = physical_page(page_index_in_region);
+            if (phys_page->is_shared_zero_page() || phys_page->is_lazy_committed_page()) {
+                dbgln_if(PAGE_FAULT_DEBUG, "NP(zero) fault in Region({})[{}] at {}", this, page_index_in_region, fault.vaddr());
+                return handle_zero_fault(page_index_in_region, *phys_page);
+            }
         }
         return handle_cow_fault(page_index_in_region);
     }
@@ -452,11 +456,18 @@ PageFaultResponse Region::handle_cow_fault(size_t page_index_in_region)
     if (current_thread)
         current_thread->did_cow_fault();

-    if (!vmobject().is_anonymous())
+    if (!vmobject().is_anonymous() && !vmobject().is_inode())
         return PageFaultResponse::ShouldCrash;

+    PageFaultResponse response;
     auto page_index_in_vmobject = translate_to_vmobject_page(page_index_in_region);
-    auto response = reinterpret_cast<AnonymousVMObject&>(vmobject()).handle_cow_fault(page_index_in_vmobject, vaddr().offset(page_index_in_region * PAGE_SIZE));
+    if (vmobject().is_inode()) {
+        reinterpret_cast<InodeVMObject&>(vmobject()).mark_page_dirty(page_index_in_vmobject);
+        dbgln("mark_page_dirty");
+        response = PageFaultResponse::Continue;
+    } else {
+        response = reinterpret_cast<AnonymousVMObject&>(vmobject()).handle_cow_fault(page_index_in_vmobject, vaddr().offset(page_index_in_region * PAGE_SIZE));
+    }
     if (!remap_vmobject_page(page_index_in_vmobject, *vmobject().physical_pages()[page_index_in_vmobject]))
         return PageFaultResponse::OutOfMemory;
     return response;
diff --git a/Kernel/Memory/SharedInodeVMObject.cpp b/Kernel/Memory/SharedInodeVMObject.cpp
index 29b2095d13..1eda04383d 100644
--- a/Kernel/Memory/SharedInodeVMObject.cpp
+++ b/Kernel/Memory/SharedInodeVMObject.cpp
@@ -64,6 +64,7 @@ ErrorOr<void> SharedInodeVMObject::sync(off_t offset_in_pages, size_t pages)
         MM.copy_physical_page(*physical_page, page_buffer);

         TRY(m_inode->write_bytes(page_index * PAGE_SIZE, PAGE_SIZE, UserOrKernelBuffer::for_kernel_buffer(page_buffer), nullptr));
+        m_dirty_pages.set(page_index, false);
     }

     return {};