[Bug 445852] Re: devkit-disks-probe-ata-smart causes HSM Violations on SSD, and potential hardware death
Martin Pitt
martin.pitt at ubuntu.com
Fri Mar 26 09:00:36 UTC 2010
Alan Pope kindly provided ssh access to his affected machine, and I
analyzed this in detail.
I am putting my raw notes here to have a permanent record. I'll follow up
with a more human-readable status in the next comment, so unless you are
interested in the technical details, you can safely ignore this long
post.
Jean-Louis' theory: check PACKET Command feature
------------------------------------------------
* Both of my computers can do SMART just fine, but both also succeed with IDENTIFY_PACKET_DEVICE and deliver real data
* An affected machine responds to SMART commands just fine with current Ubuntu 10.04 beta-1 (and delivers sensible results), so it can do SMART
libatasmart 0.17+git20100219-1git2, udisks 1.0.0 (Ubuntu 10.04)
------------------------------------------------
WORKS: # strace -e ioctl /lib/udev/udisks-probe-ata-smart /dev/sda
ioctl(3, BLKGETSIZE64, 0x9525014) = 0
ioctl(3, SG_IO, {'S', SG_DXFER_FROM_DEV, cmd[16]=[85, 08, 2e, 00, 00, 00, 01, 00, 00, 00, 00, 00, 00, 00, ec, 00], mx_sb_len=32, iovec_count=0, dxfer_len=512, timeout=2000, flags=0, data[512]=["J\4\212\36\0\0\20\0\0~\0\2?\0x\0`?\0\0 "...], status=02, masked_status=01, sb[22]=[72, 00, 00, 00, 00, 00, 00, 0e, 09, 0c, 00, 00, 00, ff, 00, 00, 00, 00, 00, 00, 00, 50], host_status=0, driver_status=0x8, resid=0, duration=0, info=0x1}) = 0
UDISKS_ATA_SMART_IS_AVAILABLE=1
WORKS: # skdump /dev/sda
WORKS: # udisks --ata-smart-wakeup --ata-smart-refresh /dev/sda
libatasmart 0.17+git20100219-1git2, dk-disks 007
------------------------------------------------
WORKS: strace -e ioctl ./devkit-disks-probe-ata-smart /dev/sda
ioctl(3, BLKGETSIZE64, 0x9686014) = 0
ioctl(3, SG_IO, {'S', SG_DXFER_FROM_DEV, cmd[16]=[85, 08, 2e, 00, 00, 00, 01, 00, 00, 00, 00, 00, 00, 00, ec, 00], mx_sb_len=32, iovec_count=0, dxfer_len=512, timeout=2000, flags=0, data[512]=["J\4\212\36\0\0\20\0\0~\0\2?\0x\0`?\0\0 "...], status=02, masked_status=01, sb[22]=[72, 00, 00, 00, 00, 00, 00, 0e, 09, 0c, 00, 00, 00, ff, 00, 00, 00, 00, 00, 00, 00, 50], host_status=0, driver_status=0x8, resid=0, duration=4, info=0x1}) = 0
DKD_ATA_SMART_IS_AVAILABLE=1
libatasmart 0.16, udisks 1.0.0
------------------------------
FAILS: LD_LIBRARY_PATH=/home/pitti/libatasmart-karmic/lib/ strace -e ioctl /lib/udev/udisks-probe-ata-smart /dev/sda
ioctl(3, BLKGETSIZE64, 0x9fef014) = 0
ioctl(3, SG_IO, {'S', SG_DXFER_FROM_DEV, cmd[16]=[85, 08, 2e, 00, 00, 00, 01, 00, 00, 00, 00, 00, 00, 00, ec, 00], mx_sb_len=32, iovec_count=0, dxfer_len=512, timeout=2000, flags=0, data[512]=["J\4\212\36\0\0\20\0\0~\0\2?\0x\0`?\0\0 "...], status=02, masked_status=01, sb[22]=[72, 00, 00, 00, 00, 00, 00, 0e, 09, 0c, 00, 00, 00, ff, 00, 00, 00, 00, 00, 00, 00, 50], host_status=0, driver_status=0x8, resid=0, duration=25768, info=0x1}) = 0
ioctl(3, SG_IO, {'S', SG_DXFER_FROM_DEV, cmd[16]=[85, 08, 2e, 00, d1, 00, 01, 00, 00, 00, 4f, 00, c2, 00, b0, 00], mx_sb_len=32, iovec_count=0, dxfer_len=512, timeout=2000, flags=0, data[512]=["\20\0\350\2\0\0\0\0\0\0\0\0\0\0\351\0\0\0\0\0\0\0\0\0\0\0\352\0\0\0\0\0"...], status=02, masked_status=01, sb[22]=[72, 00, 00, 00, 00, 00, 00, 0e, 09, 0c, 00, 00, 00, 00, 00, 00, 00, 4f, 00, c2, 00, 50], host_status=0, driver_status=0x8, resid=0, duration=24, info=0x1}) = 0
UDISKS_ATA_SMART_IS_AVAILABLE=1
libatasmart 0.16
----------------
FAILS: # strace -e ioctl ./skdump --can-smart /dev/sda >/dev/null
ioctl(3, BLKGETSIZE64, 0x8afa014) = 0
ioctl(3, SG_IO, {'S', SG_DXFER_FROM_DEV, cmd[16]=[85, 08, 2e, 00, 00, 00, 01, 00, 00, 00, 00, 00, 00, 00, ec, 00], mx_sb_len=32, iovec_count=0, dxfer_len=512, timeout=2000, flags=0, data[512]=["J\4\212\36\0\0\20\0\0~\0\2?\0x\0`?\0\0 "...], status=02, masked_status=01, sb[22]=[72, 00, 00, 00, 00, 00, 00, 0e, 09, 0c, 00, 00, 00, ff, 00, 00, 00, 00, 00, 00, 00, 50], host_status=0, driver_status=0x8, resid=0, duration=0, info=0x1}) = 0
ioctl(3, SG_IO, {'S', SG_DXFER_FROM_DEV, cmd[16]=[85, 08, 2e, 00, d1, 00, 01, 00, 00, 00, 4f, 00, c2, 00, b0, 00], mx_sb_len=32, iovec_count=0, dxfer_len=512, timeout=2000, flags=0, data[512]=["\20\0\350\2\0\0\0\0\0\0\0\0\0\0\351\0\0\0\0\0\0\0\0\0\0\0\352\0\0\0\0\0"...], status=02, masked_status=01, sb[22]=[72, 00, 00, 00, 00, 00, 00, 0e, 09, 0c, 00, 00, 00, 00, 00, 00, 00, 4f, 00, c2, 00, 50], host_status=0, driver_status=0x8, resid=0, duration=4, info=0x1}) = 0
WORKS: strace -e ioctl ./skdump /dev/sda >/dev/null
ioctl(3, SNDCTL_TMR_TIMEBASE or TCGETS, 0xbff6ed88) = -1 ENOTTY (Inappropriate ioctl for device)
ioctl(3, BLKGETSIZE64, 0x91ba014) = 0
ioctl(3, SG_IO, {'S', SG_DXFER_FROM_DEV, cmd[16]=[85, 08, 2e, 00, 00, 00, 01, 00, 00, 00, 00, 00, 00, 00, ec, 00], mx_sb_len=32, iovec_count=0, dxfer_len=512, timeout=2000, flags=0, data[512]=["J\4\212\36\0\0\20\0\0~\0\2?\0x\0`?\0\0 "...], status=02, masked_status=01, sb[22]=[72, 00, 00, 00, 00, 00, 00, 0e, 09, 0c, 00, 00, 00, ff, 00, 00, 00, 00, 00, 00, 00, 50], host_status=0, driver_status=0x8, resid=0, duration=0, info=0x1}) = 0
ioctl(3, SG_IO, {'S', SG_DXFER_FROM_DEV, cmd[16]=[85, 08, 2e, 00, d1, 00, 01, 00, 00, 00, 4f, 00, c2, 00, b0, 00], mx_sb_len=32, iovec_count=0, dxfer_len=512, timeout=2000, flags=0, data[512]=["\20\0\350\2\0\0\0\0\0\0\0\0\0\0\351\0\0\0\0\0\0\0\0\0\0\0\352\0\0\0\0\0"...], status=02, masked_status=01, sb[22]=[72, 00, 00, 00, 00, 00, 00, 0e, 09, 0c, 00, 00, 00, 00, 00, 00, 00, 4f, 00, c2, 00, 50], host_status=0, driver_status=0x8, resid=0, duration=4, info=0x1}) = 0
ioctl(1, SNDCTL_TMR_TIMEBASE or TCGETS, 0xbfd3f864) = -1 ENOTTY (Inappropriate ioctl for device)
ioctl(3, SG_IO, {'S', SG_DXFER_NONE, cmd[16]=[85, 06, 20, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, e5, 00], mx_sb_len=32, iovec_count=0, dxfer_len=0, timeout=2000, flags=0, status=02, masked_status=01, sb[22]=[72, 00, 00, 00, 00, 00, 00, 0e, 09, 0c, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 50], host_status=0, driver_status=0x8, resid=0, duration=0, info=0x1}) = 0
ioctl(3, SG_IO, {'S', SG_DXFER_NONE, cmd[16]=[85, 06, 20, 00, da, 00, 00, 00, 00, 00, 4f, 00, c2, 00, b0, 00], mx_sb_len=32, iovec_count=0, dxfer_len=0, timeout=2000, flags=0, status=02, masked_status=01, sb[22]=[72, 00, 00, 00, 00, 00, 00, 0e, 09, 0c, 00, 00, 00, 00, 00, 00, 00, 4f, 00, c2, 00, 50], host_status=0, driver_status=0x8, resid=0, duration=0, info=0x1}) = 0
ioctl(3, SG_IO, {'S', SG_DXFER_FROM_DEV, cmd[16]=[85, 08, 2e, 00, d0, 00, 01, 00, 00, 00, 4f, 00, c2, 00, b0, 00], mx_sb_len=32, iovec_count=0, dxfer_len=512, timeout=2000, flags=0, data[512]=["\20\0\350\2\0ddd\0\0\0\0\0\0\351\2\0ddpy\6\0\0\0\0\352\2\0dd\262"...], status=02, masked_status=01, sb[22]=[72, 00, 00, 00, 00, 00, 00, 0e, 09, 0c, 00, 00, 00, 00, 00, 00, 00, 4f, 00, c2, 00, 50], host_status=0, driver_status=0x8, resid=0, duration=4, info=0x1}) = 0
ioctl(3, SG_IO, {'S', SG_DXFER_NONE, cmd[16]=[85, 06, 20, 00, da, 00, 00, 00, 00, 00, 4f, 00, c2, 00, b0, 00], mx_sb_len=32, iovec_count=0, dxfer_len=0, timeout=2000, flags=0, status=02, masked_status=01, sb[22]=[72, 00, 00, 00, 00, 00, 00, 0e, 09, 0c, 00, 00, 00, 00, 00, 00, 00, 4f, 00, c2, 00, 50], host_status=0, driver_status=0x8, resid=0, duration=0, info=0x1}) = 0
evaluation
----------
* With 0.16, skdump --can-smart and {devkit-,u}disks-probe-ata-smart do IDENTIFY_DEVICE and READ_THRESHOLDS → fail
* With GIT HEAD, skdump --can-smart and {devkit-,u}disks-probe-ata-smart do IDENTIFY_DEVICE → works
* With both 0.16 and GIT HEAD, skdump does IDENTIFY_DEVICE, READ_THRESHOLDS, and READ_DATA → works
* smartmontools always does READ_THRESHOLDS and READ_DATA closely together
It seems that READ_THRESHOLDS without READ_DATA causes this problem: the
drive "wants" to send more data, which is never flushed. Possible
explanation: https://bugzilla.kernel.org/show_bug.cgi?id=14583#c25
bisecting
---------
only three commits between 0.16 and git head which could change/fix this:
http://git.0pointer.de/?p=libatasmart.git;a=commitdiff;h=a223a4f6277a9f006b722b13671d5292dc6339bb
http://git.0pointer.de/?p=libatasmart.git;a=commitdiff;h=cfe49b30af32b5b631b2a055c6d10197a70d90ff
http://git.0pointer.de/?p=libatasmart.git;a=commitdiff;h=51502143eeb0a5553ab5977d07bf707dac47200c
a223a4: BINGO, this drops the READ_THRESHOLDS call
Quite obviously from the commit, sk_disk_open() called disk_smart_read_thresholds(), but not sk_disk_smart_read_data().
udisks-probe-ata-smart and skdump --can-smart just call sk_disk_open() and sk_disk_smart_is_available() (the latter does not do any I/O itself, just tests a flag).
So while a223a4 fixes this for the "common" use cases, there might still
be situations where thresholds are read, but not the values. Let's look
at where init_smart() (the only place reading thresholds) is called:
* sk_disk_smart_read_data(): OK, does READ_DATA
* sk_disk_smart_status(): Only does SK_SMART_COMMAND_RETURN_STATUS,
potentially affected
* sk_disk_smart_self_test(): OK, calls sk_disk_smart_read_data()
Potential robust solution: Make init_smart() call
sk_disk_smart_read_data().
Testing sk_disk_smart_status()
------------------------------
Calling only sk_disk_open() and sk_disk_smart_status():
strace -e ioctl ./skdump --status /dev/sda
ioctl(3, BLKGETSIZE64, 0x904f014) = 0
ioctl(3, SG_IO, {'S', SG_DXFER_FROM_DEV, cmd[16]=[85, 08, 2e, 00, 00, 00, 01, 00, 00, 00, 00, 00, 00, 00, ec, 00], mx_sb_len=32, iovec_count=0, dxfer_len=512, timeout=2000, flags=0, data[512]=["J\4\212\36\0\0\20\0\0~\0\2?\0x\0`?\0\0 "...], status=02, masked_status=01, sb[22]=[72, 00, 00, 00, 00, 00, 00, 0e, 09, 0c, 00, 00, 00, ff, 00, 00, 00, 00, 00, 00, 00, 50], host_status=0, driver_status=0x8, resid=0, duration=4, info=0x1}) = 0
ioctl(3, SG_IO, {'S', SG_DXFER_FROM_DEV, cmd[16]=[85, 08, 2e, 00, d1, 00, 01, 00, 00, 00, 4f, 00, c2, 00, b0, 00], mx_sb_len=32, iovec_count=0, dxfer_len=512, timeout=2000, flags=0, data[512]=["\20\0\350\2\0\0\0\0\0\0\0\0\0\0\351\0\0\0\0\0\0\0\0\0\0\0\352\0\0\0\0\0"...], status=02, masked_status=01, sb[22]=[72, 00, 00, 00, 00, 00, 00, 0e, 09, 0c, 00, 00, 00, 00, 00, 00, 00, 4f, 00, c2, 00, 50], host_status=0, driver_status=0x8, resid=0, duration=0, info=0x1}) = 0
ioctl(3, SG_IO, {'S', SG_DXFER_NONE, cmd[16]=[85, 06, 20, 00, da, 00, 00, 00, 00, 00, 4f, 00, c2, 00, b0, 00], mx_sb_len=32, iovec_count=0, dxfer_len=0, timeout=2000, flags=0, status=02, masked_status=01, sb[22]=[72, 00, 00, 00, 00, 00, 00, 0e, 09, 0c, 00, 00, 00, 00, 00, 00, 00, 4f, 00, c2, 00, 50], host_status=0, driver_status=0x8, resid=0, duration=0, info=0x1}) = 0
So this does READ_THRESHOLDS without READ_DATA from disk_open(), plus an
additional RETURN_STATUS, and this combination also seems to work.
--
devkit-disks-probe-ata-smart causes HSM Violations on SSD, and potential hardware death
https://bugs.launchpad.net/bugs/445852
You received this bug notification because you are a member of Kernel
Bugs, which is subscribed to Linux.
More information about the kernel-bugs
mailing list