baruch / diskscan

Scan disk for bad or near failure sectors, performs disk diagnostics
GNU General Public License v3.0
108 stars 29 forks source link

Diskscan hangs on Fedora 25 #58

Open deamen opened 7 years ago

deamen commented 7 years ago

When I try to run diskscan /dev/sdc, it hangs at "I: Validating path /dev/sdc"

The strace log as below:

execve("./diskscan", ["./diskscan", "/dev/sdc"], [/* 20 vars */]) = 0
brk(NULL)                               = 0xd3c000
mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f42c9ad3000
access("/etc/ld.so.preload", R_OK)      = -1 ENOENT (No such file or directory)
open("/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=29307, ...}) = 0
mmap(NULL, 29307, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f42c9acb000
close(3)                                = 0
open("/lib64/libm.so.6", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\3\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0 W\0\0\0\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=1169408, ...}) = 0
mmap(NULL, 3178728, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f42c95a7000
mprotect(0x7f42c96af000, 2093056, PROT_NONE) = 0
mmap(0x7f42c98ae000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x107000) = 0x7f42c98ae000
close(3)                                = 0
open("/lib64/libtinfo.so.6", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\240\331\0\0\0\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=180200, ...}) = 0
mmap(NULL, 2275584, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f42c937b000
mprotect(0x7f42c93a2000, 2097152, PROT_NONE) = 0
mmap(0x7f42c95a2000, 20480, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x27000) = 0x7f42c95a2000
close(3)                                = 0
open("/lib64/libz.so.1", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0` \0\0\0\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=89520, ...}) = 0
mmap(NULL, 2183272, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f42c9165000
mprotect(0x7f42c917a000, 2093056, PROT_NONE) = 0
mmap(0x7f42c9379000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x14000) = 0x7f42c9379000
close(3)                                = 0
open("/lib64/librt.so.1", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\340 \0\0\0\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=44040, ...}) = 0
mmap(NULL, 2128800, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f42c8f5d000
mprotect(0x7f42c8f64000, 2093056, PROT_NONE) = 0
mmap(0x7f42c9163000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x6000) = 0x7f42c9163000
close(3)                                = 0
open("/lib64/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\3\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0 \5\2\0\0\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=2115824, ...}) = 0
mmap(NULL, 3955040, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f42c8b97000
mprotect(0x7f42c8d54000, 2093056, PROT_NONE) = 0
mmap(0x7f42c8f53000, 24576, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x1bc000) = 0x7f42c8f53000
mmap(0x7f42c8f59000, 14688, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f42c8f59000
close(3)                                = 0
open("/lib64/libpthread.so.0", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\240`\0\0\0\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=151104, ...}) = 0
mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f42c9ac9000
mmap(NULL, 2217000, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f42c8979000
mprotect(0x7f42c8991000, 2097152, PROT_NONE) = 0
mmap(0x7f42c8b91000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x18000) = 0x7f42c8b91000
mmap(0x7f42c8b93000, 13352, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f42c8b93000
close(3)                                = 0
mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f42c9ac7000
arch_prctl(ARCH_SET_FS, 0x7f42c9ac7700) = 0
mprotect(0x7f42c8f53000, 16384, PROT_READ) = 0
mprotect(0x7f42c8b91000, 4096, PROT_READ) = 0
mprotect(0x7f42c9163000, 4096, PROT_READ) = 0
mprotect(0x7f42c9379000, 4096, PROT_READ) = 0
mprotect(0x7f42c95a2000, 16384, PROT_READ) = 0
mprotect(0x7f42c98ae000, 4096, PROT_READ) = 0
mprotect(0x617000, 4096, PROT_READ)     = 0
mprotect(0x7f42c9ad5000, 4096, PROT_READ) = 0
munmap(0x7f42c9acb000, 29307)           = 0
set_tid_address(0x7f42c9ac79d0)         = 4494
set_robust_list(0x7f42c9ac79e0, 24)     = 0
rt_sigaction(SIGRTMIN, {sa_handler=0x7f42c897eb40, sa_mask=[], sa_flags=SA_RESTORER|SA_SIGINFO, sa_restorer=0x7f42c898a5c0}, NULL, 8) = 0
rt_sigaction(SIGRT_1, {sa_handler=0x7f42c897ebd0, sa_mask=[], sa_flags=SA_RESTORER|SA_RESTART|SA_SIGINFO, sa_restorer=0x7f42c898a5c0}, NULL, 8) = 0
rt_sigprocmask(SIG_UNBLOCK, [RTMIN RT_1], NULL, 8) = 0
getrlimit(RLIMIT_STACK, {rlim_cur=8192*1024, rlim_max=RLIM64_INFINITY}) = 0
fstat(1, {st_mode=S_IFCHR|0620, st_rdev=makedev(136, 1), ...}) = 0
brk(NULL)                               = 0xd3c000
brk(0xd5d000)                           = 0xd5d000
brk(NULL)                               = 0xd5d000
write(1, "diskscan version 0.19\n\n", 23) = 23
rt_sigaction(SIGINT, {sa_handler=0x404033, sa_mask=[], sa_flags=SA_RESTORER|SA_RESTART, sa_restorer=0x7f42c8bcc990}, NULL, 8) = 0
rt_sigaction(SIGTERM, {sa_handler=0x404033, sa_mask=[], sa_flags=SA_RESTORER|SA_RESTART, sa_restorer=0x7f42c8bcc990}, NULL, 8) = 0
write(1, "I: Validating path /dev/sdc\n", 28) = 28
access("/dev/sdc", F_OK)                = 0
access("/dev/sdc", R_OK)                = 0
open("/dev/sdc", O_RDWR|O_DIRECT)       = 3
ioctl(3, SG_IO, {interface_id='S', dxfer_direction=SG_DXFER_FROM_DEV, cmd_len=10, cmdp="\x25\x00\x00\x00\x00\x00\x00\x00\x00\x00", mx_sb_len=128, iovec_count=0, dxfer_len=512, timeout=600000, flags=SG_FLAG_UNUSED_LUN_INHIBIT, dxferp="\x2e\x93\x90\xaf\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"..., status=0, masked_status=0, msg_status=0, sb_len_wr=0, sbp="", host_status=0, driver_status=0, resid=0, duration=0, info=0}) = 0
ioctl(3, SG_IO, {interface_id='S', dxfer_direction=SG_DXFER_FROM_DEV, cmd_len=6, cmdp="\x12\x00\x00\x00\x60\x00", mx_sb_len=128, iovec_count=0, dxfer_len=512, timeout=600000, flags=SG_FLAG_UNUSED_LUN_INHIBIT, dxferp="\x00\x00\x06\x12\x45\x00\x00\x02\x41\x54\x41\x20\x20\x20\x20\x20\x53\x54\x33\x34\x30\x30\x36\x32\x30\x4e\x53\x20\x20\x20\x20\x20"..., status=0, masked_status=0, msg_status=0, sb_len_wr=0, sbp="", host_status=0, driver_status=0, resid=438, duration=1, info=0}) = 0
ioctl(3, SG_IO, {interface_id='S', dxfer_direction=SG_DXFER_FROM_DEV, cmd_len=12, cmdp="\xa1\x0c\x0e\x00\x01\x00\x00\x00\x00\xec\x00\x00", mx_sb_len=128, iovec_count=0, dxfer_len=512, timeout=600000, flags=SG_FLAG_UNUSED_LUN_INHIBIT
baruch commented 7 years ago

What drive is this? Can you show the output of smartctl -a /dev/sdc?

It looks like some diagnostics command didn't return from the drive.

baruch commented 7 years ago

No idea why it got hanged like that but I probably should set the timeout to a lower value in any case to avoid it hanging.

baruch commented 7 years ago

In addition to the requested information from above, can you please try the head of the master branch? I reduced the timeout for the stuck IO to 5 seconds but I have no way to test it locally.

deamen commented 7 years ago

Hi Baruch, The HDD is a Seagate SATA HDD, ST3400620NS.

The hanging issue only happens when the drive is connected to a LSI SAS HBA, when I connect the drive to an on-board SATA port diskscan works fine. The LSI SAS HBA information: 02:00.0 Serial Attached SCSI controller: LSI Logic / Symbios Logic SAS2008 PCI-Express Fusion-MPT SAS-2 [Falcon] (rev 03)

I will try your timeout change later today and report back to you.

The HDD information as below: hdd.txt Thanks

deamen commented 7 years ago

Hi Baruch, I confirm the timeout commit (cd596a1) works, I can cancel diskscan with ctrl + c now:

diskscan version 0.19

I: Validating path /dev/sdc
E: Failed to read SMART attributes from device
I: Opened disk /dev/sdc sector size 512 num bytes 5000981077504
I: Scanning disk /dev/sdc in 65536 byte steps
I: Scan started at: Thu Apr 13 09:01:22 2017

Disk scan |                                                                                                                  | ETA: 0h00m00s
E: Error when reading at offset 0 size 65536 read -1, errno=0: Success

E: Details: error=need_retry data=none 06/29/00

E: Error when reading at offset 40501248 size 65536 read -1, errno=0: Success

E: Details: error=need_retry data=none 06/29/00
^C^C^C^C^C^C^C^C^C^C^C^C^C^C^C^C^C^C^C^C^C^C^C^C
E: Failed to read SMART attributes from device
I: Disk scan interrupted
MasterCATZ commented 7 years ago

can we have a setting to set our own time out as 2ms seems to be my max timeout at the moment still takes a long time to try and cancel

even pkill -9 did not do much to finish it

baruch commented 7 years ago

I can add a command line option to set the timeout but a too low timeout will mean that it will be impossible to scan bad disks. I'll also try to think about what else can be done in such cases.