#!/usr/bin/python
#
# A version of iostat that actually reports real numbers taken straight
# from the kernel. This uses the 2.4 /proc/partitions stuff.
#
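# Usage: xiostat.py [-v] [-q] [-c COUNT] [DEV [DELAY]]
#
# -v: when DEV is given as a mount point, print the name of the device
#     it was resolved to.
# -q: don't print the periodic header.
# -c COUNT: exit after printing COUNT reports.
# DEV is the device to report stats on; it defaults to 'sde1'. If DEV
# starts with '/', it is treated as a filesystem mount point and is
# resolved to the device it is mounted from.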
# DELAY is the amount of time to delay between report lines, in
# seconds; it defaults to one (1) second. You can use fractional
# seconds if you want to.
#
# All statistics are per-second ones, regardless of the delay, except for
# 'act' which is an instantaneous number (how many IOs are in flight RIGHT
# NOW, as the program reads the data).
# 'rwait', 'wwait', and 'await' are in milliseconds. 'util' is a percentage.
# 'rsect' and 'wsect' are 512-byte units. (We could print rkB and wkB, but
# the author wants this to be as close to raw as possible.)

# This avoids all sorts of irritating 'float(...)/...' usage and lets
# us just divide straight.
from __future__ import division

import sys, time, getopt, os.path
import re

# This is not a user-serviceable part. It controls whether we barf up
# messages when delta-computed fields go negative.
# When nonzero, iostats.__sub__ writes each negative delta to stderr.
debug = 0

class IOEx(Exception):
	"""Error raised by this program for conditions it reports to the user.

	process() catches it, prints its message via error(), and exits
	with status 1.
	"""

# Resolve a filesystem mount point into a device name suitable for
# finding in /proc/partitions et al.
# Note that in /proc/mounts, loopback mounts show as being mounted
# on the real device. It's only in 'mount' that they come up funny.
def find_device(fsname, mounts = "/proc/mounts"):
	"""Return the device field of the mount-table entry mounted at fsname.

	fsname is compared against the second whitespace-separated field of
	each line of the mounts file; the first field of the first matching
	line is returned.

	mounts is the file to scan; it defaults to /proc/mounts.

	Raises IOEx if the first matching entry has a filesystem type other
	than ext2, ext3 or vfat, or if no entry matches at all.
	"""
	fp = open(mounts, "r")
	try:
		for ln in fp:
			flds = ln.split()
			# Blank or truncated lines cannot describe a mount;
			# skip them instead of failing on a missing field.
			if len(flds) < 3 or flds[1] != fsname:
				continue
			if flds[2] not in ('ext2', 'ext3', 'vfat', ):
				raise IOEx("cannot deal with filesystem type %s for %s" % (flds[2], fsname))
			return flds[0]
	finally:
		# Close the file on every exit path: match, error, or EOF.
		fp.close()
	raise IOEx("could not find %s entry for filesystem %s" % (mounts, fsname))
# Matches one logical-volume entry as resolve_lvm() sees it: a bracketed
# run of capital letters and spaces, the LV name (group 1), then two
# numbers separated by " /".
lvre = re.compile(r"\[[A-Z ]+\] ([^\s]+)\s+\d+ /\d+ ")
def resolve_lvm(devname, lvmfile = "/proc/lvm/global"):
	"""Map an LVM volume name of the form 'VG/LV' to a physical device.

	The LVM status file is scanned line by line. 'VG: ' lines set the
	current volume group, 'PV: ' lines set the current physical device
	(their third field), and 'LVs: ' / 'LV: ' lines start a run of
	logical-volume entries. When an entry names the requested LV inside
	the requested VG, the most recently seen physical device is returned.
	NOTE(review): this presumes the PV line comes before the LV entries
	of the same VG, and that a VG lives on one PV -- confirm against the
	real /proc/lvm/global layout.

	lvmfile is the file to scan; it defaults to /proc/lvm/global.

	Raises IOEx if no matching logical volume is found.
	"""
	# Split LVM name into VG and LV.
	ab = devname.split("/")
	fp = open(lvmfile, "r")
	try:
		curvg = ""; physdev = ""
		inlv = 0
		for ln in fp:
			ln = ln.strip()
			if ln.startswith("VG: "):
				curvg = ln.split()[1]
				inlv = 0
			elif ln.startswith("PV: "):
				physdev = ln.split()[2]
			elif ln.startswith("LVs: ") or ln.startswith("LV: "):
				inlv = 1
				# Drop the 'LVs:' tag so the rest of the line
				# looks like a continuation entry.
				ln = ln.split(None, 1)[1]
			if inlv:
				mo = lvre.match(ln)
				if not mo:
					# First non-entry line ends the LV run.
					inlv = 0
					continue
				if mo.group(1) == ab[1] and curvg == ab[0]:
					return physdev
	finally:
		# Close the file on every exit path.
		fp.close()
	raise IOEx("Cannot find the physical volume for LVM volume /dev/%s" % devname)
def resolve_fsname(fsname):
	"""Turn a mount point into a device name without the '/dev/' prefix.

	The mounted device is looked up with find_device(). MD devices and
	anything not under /dev/ are rejected with IOEx. If what remains
	after stripping '/dev/' still contains a '/', it is taken to be an
	LVM 'VG/LV' name (an ugly heuristic) and handed to resolve_lvm().
	"""
	prefix = "/dev/"
	dn = find_device(fsname)
	if dn.startswith("/dev/md"):
		raise IOEx("filesystem %s is on MD device %s, we cannot deal with those" % (fsname, dn))
	if not dn.startswith(prefix):
		raise IOEx("filesystem %s is on %s, which I cannot deal with" % (fsname, dn))
	# Strip the leading /dev/.
	short = dn[len(prefix):]
	if '/' in short:
		return resolve_lvm(short)
	return short

# Subtract two unsigned 32-bit integers in the presence of rollover.
# The best we can do is assume that the rollover only happens once.
#
# While the theory that negative number freakouts are caused by
# (unsigned) integer rollovers is a nice one and attractive and all
# that, it turns out to be completely wrong. While they are possible,
# dumping the numbers involved makes it clear that they are not the
# actual cause; the 'b' number is only around 2**25 to 2**26 or so.
# At the moment, the author is inclined to blame (lack of) locking on
# the gendisk stats, since the stat update caused by reading them from
# /proc/partitions can likely race with stat updates from interrupt
# level.
#
# Still, we preserve this code for posterity just in case.
# Largest value an unsigned 32-bit counter can hold.
maxU = 2**32-1
def uintSub(a, b):
	"""Return a - b, treating both as unsigned 32-bit counters that may
	have wrapped around once between the 'b' reading and the 'a' reading.

	If either value does not fit in 32 bits (some counters may be long
	longs), or if a >= b so no wrap is evident, the plain difference is
	returned.
	"""
	if a > maxU or b > maxU or a >= b:
		return a - b
	# A 32-bit counter wraps modulo 2**32, i.e. maxU + 1. Reducing
	# modulo maxU itself would come out one too small (0 - maxU must
	# be 1, not 0).
	return (a - b) % (maxU + 1)

# This stores the IO statistics (for a single device object) harvested
# from /proc/partitions. We define subtraction on iostat objects as a
# delta operation, which means we leave 'running' intact.
class iostats:
	"""One sample of the per-device I/O counters, or a delta between two.

	Every name in 'fields' becomes an attribute. A field whose value is
	None is 'not supported': the source line did not contain it.
	"""
	fields = ('rio', 'rmerge', 'rsect', 'ruse', 'wio', 'wmerge', 'wsect',
		'wuse', 'running', 'use', 'aveq', 'rduse', 'wduse',
		'rtime', 'wtime', 'raveq', 'waveq', )
	def __init__(self, lst = None):
		"""Build from a sequence of values given in 'fields' order.

		With no (or an empty) lst every field is 0. Otherwise fields
		beyond the end of lst are set to None and values beyond the
		end of 'fields' are ignored.
		"""
		nfields = len(self.fields)
		if not lst:
			values = [0] * nfields
		else:
			values = list(lst[:nfields])
			# None is the special 'not supported' value.
			values.extend([None] * (nfields - len(values)))
		for name, value in zip(self.fields, values):
			setattr(self, name, value)
	def __sub__(self, other):
		"""Return the delta self - other as a new iostats.

		'running' is an instantaneous number and is copied from self
		rather than subtracted. A field that is None on either side
		is None in the result. A negative difference is replaced by
		-1, which marks the field as invalid; if the module-level
		'debug' flag is set, the raw numbers are written to stderr.
		"""
		n = iostats()
		for f in self.fields:
			mine = getattr(self, f)
			if f == 'running':
				setattr(n, f, mine)
				continue
			theirs = getattr(other, f)
			# propagate 'not supported' magically, whichever
			# operand is missing the field.
			if mine is None or theirs is None:
				setattr(n, f, None)
				continue
			# Plain subtraction on purpose: see the comment on
			# uintSub for why rollover handling is not used.
			res = mine - theirs
			if res < 0:
				if debug:
					sys.stderr.write("xiostat: %s freakout: %d - %d = %d\n" % (f, mine, theirs, res))
				res = -1
			setattr(n, f, res)
		return n
	def __str__(self):
		"""Return '<iostat: name: value, ...>'; unsupported fields show as n/a."""
		parts = []
		for f in self.fields:
			v = getattr(self, f)
			if v is None:
				# '%d' cannot format None.
				parts.append("%s: n/a" % f)
			else:
				parts.append("%s: %d" % (f, v))
		return "<iostat: %s>" % ", ".join(parts)

# Read /proc/uptime.
# We return the uptime in seconds (the first field). Note that this can
# have decimal digits, so we must return it as a floating point number.
def getuptime(fn):
	"""Return the first whitespace-separated field of the first line of
	the file fn, converted to a float."""
	uptime_file = open(fn, "r")
	first_line = uptime_file.readline()
	uptime_file.close()
	pieces = first_line.split()
	return float(pieces[0])

# Read /proc/partitions, returning an iostat object.
# This will alternately read /proc/diskstats, returning the same.
# We tell the difference based on black magic.
def getdiskstat(fn, dev):
	"""Return an iostats for device dev read from the file fn, or None
	if no usable line for dev is found.

	The format is detected from the first non-blank line: if its first
	field is 'major' it is a header, the device name is the fourth field
	and data lines need at least 15 fields; otherwise the device name is
	the third field and lines need at least 14 fields. Everything after
	the device name is converted to integers and passed to iostats.
	"""
	fp = open(fn, "r")
	try:
		devAt = None; lineLen = None
		for l in fp:
			n = l.split()
			# Blank lines carry no data and must not be used
			# for format detection.
			if not n:
				continue
			# first line: determine format.
			if devAt is None:
				if n[0] == "major":
					devAt = 3; lineLen = 15
					continue
				devAt = 2; lineLen = 14
			if len(n) < lineLen or n[devAt] != dev:
				continue
			# int() yields a long by itself when the value is
			# too big for a plain int.
			return iostats([int(x) for x in n[devAt+1:]])
		return None
	finally:
		# Close the file whether or not the device was found.
		fp.close()

# Return a decimalized string that will fit into a field that is fw wide.
# We never use more than two decimal places, but we can use one or
# zero depending on the size of 'num'.
# Life is complicated by our need to cope with the rounding that goes
# on with floating point numbers during printing.
def decimalize(num, fw):
	"""Format num right-justified in a field fw characters wide.

	Whole numbers, and numbers too large to leave room for a decimal
	digit, are printed as integers; otherwise one or two decimal places
	are used, whichever still fits in fw.
	"""
	# Smallest values that no longer fit with two / one decimal places.
	maxtwo = 10 ** (fw-3)
	maxone = 10 ** (fw-2)
	# Sometimes formatting a floating point number for printing with
	# a limited number of digits can make it larger, causing us to
	# potentially overflow a field width. We use round() beforehand
	# here to cope with that; otherwise we would get periodic results
	# of '100.0' for a four-digit field, for example, when we get
	# handed '99.999' as the number to print.
	#
	# int() is used for the whole-number test because it promotes to a
	# long on its own where needed and does not rely on the
	# Python-2-only long() builtin.
	if round(num, 1) >= maxone or int(num) == num:
		# '%d' truncates, so round explicitly first.
		return "%*d" % (fw, round(num, 0))
	elif round(num, 2) >= maxtwo:
		return "%*.1f" % (fw, num)
	else:
		return "%*.2f" % (fw, num)

# This calculates the numeric value from the field name, the IO delta
# structure, and the time delta. (The time delta is in seconds.)  Some
# fields are more or less straight from the iod (sometimes recomputed
# to per-second values); others are calculated from multiple others.
# Fields that are reported as per-second rates (delta divided by time).
persecs = ('rio', 'rmerge', 'rsect', 'wio', 'wmerge', 'wsect',)
def calcval(iod, td, field):
	"""Compute the display value of one field.

	iod is the delta between two samples, td the time delta in seconds,
	field the name of the field to compute.

	Returns a number, or None when a field the result depends on is
	'not supported' (None). A negative input marks an invalid delta;
	results built from one are reported as negative (-1 for everything
	computed from more than one counter, for the utilisation fields and
	for the per-second fields) so they stand out in the output.

	Raises IOEx for a field name that is neither computed here nor an
	attribute of iod.
	"""
	# if b is nonzero, return a/b; otherwise return 0.
	def zerodiv(a, b):
		# In rare cases (iod.use especially) the denominator can
		# be negative instead of the numerator.
		if a is None:
			return None
		elif a < 0:
			return a
		elif b is None:
			return None
		elif b < 0:
			return b
		if b:
			return a / b
		return 0
	# (a1 + a2) / (b1 + b2), rippling 'not supported' and 'invalid'
	# through from any of the four components. Summing first would let
	# a -1 marker hide inside an otherwise plausible total.
	def combined(a1, a2, b1, b2):
		parts = (a1, a2, b1, b2)
		for p in parts:
			if p is None:
				return None
		for p in parts:
			if p < 0:
				return -1
		return zerodiv(a1 + a2, b1 + b2)
	# The inputs are in milliseconds and the result is a percentage,
	# so dividing by (seconds * 10) works out nicely.
	def percent(ms):
		if ms is None:
			return None
		# 'use' can apparently go negative too. Who knew? Sigh.
		if ms < 0:
			return -1
		return ms / (td*10)

	if field == 'act':
		return iod.running
	elif field == 'rwait':
		return zerodiv(iod.ruse, iod.rio)
	elif field == 'wwait':
		return zerodiv(iod.wuse, iod.wio)
	elif field == 'await':
		return combined(iod.ruse, iod.wuse, iod.rio, iod.wio)
	elif field == 'rdwait':
		return zerodiv(iod.rduse, iod.rio)
	elif field == 'wdwait':
		return zerodiv(iod.wduse, iod.wio)
	elif field == 'adwait':
		return combined(iod.rduse, iod.wduse, iod.rio, iod.wio)
	elif field == 'aveq':
		return zerodiv(iod.aveq, iod.use)
	elif field == 'raveq':
		return zerodiv(iod.raveq, iod.rtime)
	elif field == 'waveq':
		return zerodiv(iod.waveq, iod.wtime)
	elif field == 'rgrp':
		return zerodiv(iod.rsect, iod.rio)
	elif field == 'wgrp':
		return zerodiv(iod.wsect, iod.wio)
	elif field == 'agrp':
		return combined(iod.rsect, iod.wsect, iod.rio, iod.wio)
	elif field == 'util':
		return percent(iod.use)
	elif field == 'rutil':
		return percent(iod.rtime)
	elif field == 'wutil':
		return percent(iod.wtime)
	elif field in persecs and hasattr(iod, field):
		raw = getattr(iod, field)
		if raw is None:
			return None
		# Do not scale the 'invalid' marker by the time delta.
		if raw < 0:
			return -1
		return raw / td
	elif hasattr(iod, field):
		return getattr(iod, field)
	else:
		raise IOEx("don't know how to compute %s" % field)

# Column width, in characters, used when printing each field.
fwidth = {
	'act': 4,
	'rio': 6, 'rmerge': 6, 'rsect': 6, 'rwait': 5, 'rgrp': 5,
	'wio': 6, 'wmerge': 6, 'wsect': 6, 'wwait': 5, 'wgrp': 5,
	'agrp': 5, 'aveq': 4, 'await': 5, 'util': 4,
	'rdwait': 6, 'wdwait': 6, 'adwait': 6,
	'rutil': 5, 'wutil': 5, 'raveq': 5, 'waveq': 5,
}
# Left-to-right order of the columns in the header and in each report.
forder = (
	'act',
	'rio', 'rmerge', 'rsect', 'rwait', 'rgrp',
	'wio', 'wmerge', 'wsect', 'wwait', 'wgrp',
	'agrp', 'aveq', 'await', 'util',
	'rdwait', 'wdwait', 'adwait', 'rutil', 'wutil', 'raveq', 'waveq',
)

# Display an iostat delta, using the given time delta to convert the
# relevant numbers to a per-second count. The time delta is in seconds.
def display(iod, td):
	"""Print one report line for the delta iod taken over td seconds.

	Fields are emitted in forder order, each formatted by decimalize()
	to its fwidth width; fields whose value is None are left out.
	"""
	cells = []
	for name in forder:
		value = calcval(iod, td, name)
		if value is not None:
			cells.append(decimalize(value, fwidth[name]))
	sys.stdout.write(" ".join(cells) + "\n")
	# Flush right away in case stdout is a file or a pipe being used
	# for logging.
	sys.stdout.flush()
# Print the report header.
def header(iod, td):
	"""Print the column titles that line up with display()'s output.

	Each field's value is computed only to find out whether it is None;
	such fields get no column, exactly as in display().
	"""
	titles = []
	for name in forder:
		if calcval(iod, td, name) is not None:
			titles.append("%*s" % (fwidth[name], name))
	sys.stdout.write(" ".join(titles) + "\n")
	
# Our main processing loop proceeds by getting the initial stats,
# then looping around sleeping the delay time, getting new stats,
# computing the delta, and finally displaying them.
def statloop(every, dev, showheader = 1, max = 0):
	"""Sample the statistics of dev every 'every' seconds and print them.

	Statistics come from /proc/diskstats if that file exists, otherwise
	from /proc/partitions. If showheader is true the header is printed
	at the start and again before every 22nd report. max is the number
	of reports to print before returning; 0 means run forever.

	Raises IOEx if the device's statistics cannot be read.
	"""
	oldUt = getuptime("/proc/uptime")
	if os.path.exists("/proc/diskstats"):
		statFile = "/proc/diskstats"
	else:
		statFile = "/proc/partitions"
	oldSt = getdiskstat(statFile, dev)
	if oldSt is None:
		raise IOEx("cannot get starting stats for %s" % dev)
	# We produce the header immediately as a reassurance that we
	# are actually doing something, since it could be some time
	# before the first display (since it comes after an 'every'
	# interval).
	if showheader:
		header(oldSt, 1)
	# lc is the number of the report about to be printed, starting at
	# 1. The loop therefore has to run while lc <= max; testing
	# lc < max would stop one report short (and print none at all
	# for max == 1).
	lc = 1
	while (max == 0) or lc <= max:
		time.sleep(every)
		newUt = getuptime("/proc/uptime")
		newSt = getdiskstat(statFile, dev)
		if newSt is None:
			raise IOEx("cannot get new stats for %s" % dev)
		# We compute the time delta explicitly (from /proc/uptime's
		# uptime time) because on a loaded system we may have slept
		# for (much) longer than 'every' seconds.
		td = newUt - oldUt
		# If there is no time delta, we punt: keep the old sample
		# and try again after another sleep.
		if td == 0:
			continue
		iosd = newSt - oldSt
		if showheader and lc % 22 == 0:
			header(iosd, td)
		display(iosd, td)
		oldSt = newSt
		oldUt = newUt
		lc += 1

def error(msg):
	"""Write msg to stderr, prefixed with the program name (sys.argv[0])."""
	prog = sys.argv[0]
	sys.stderr.write("%s: %s\n" % (prog, msg))
def die(msg):
	"""Report msg through error() and terminate with exit status 1."""
	error(msg)
	raise SystemExit(1)
def usage():
	"""Write the usage summary to stderr and exit with status 1."""
	for text in ("usage: xiostat [-v] [-c COUNT] [-q] [DEV [DELAY]]\n",
			"\tDEV defaults to sde1, DELAY to 1 second\n"):
		sys.stderr.write(text)
	sys.exit(1)
def process(args):
	"""Parse the command-line arguments in args and run the report loop.

	Options: -q suppresses the header, -v prints the device a mount
	point was resolved to, -c COUNT sets the number of reports. Up to
	two positional arguments give the device (or a mount point, if it
	starts with '/') and the delay in seconds.

	Always ends by calling sys.exit(): status 0 when the report loop
	returns, status 1 after a usage error, an IOEx or a system error.
	"""
	repmax = 0; every = 1; dev = "sde1"; showheader = 1
	verbose = 0
	# Caught exceptions are fetched with sys.exc_info() rather than
	# bound in the except clause, because the binding syntax differs
	# between Python 2 and Python 3.
	try:
		opt, arg = getopt.getopt(args, 'qc:v', [])
	except getopt.GetoptError:
		error(str(sys.exc_info()[1]))
		usage()
	for o, a in opt:
		if o == '-q':
			showheader = 0
		elif o == '-v':
			verbose += 1
		elif o == '-c':
			try:
				repmax = int(a)
			except ValueError:
				die("-c argument '%s' is not an integer" % a)
		else:
			die("Chris failed to handle switch '%s'" % o)
	if len(arg) > 2:
		error("Extra arguments supplied.")
		usage()
	if len(arg) == 2:
		try:
			every = float(arg[1])
		except ValueError:
			error("Cannot turn delay '%s' into a number" % arg[1])
			usage()
	if len(arg) > 0:
		dev = arg[0]
	try:
		# Mount-point resolution happens inside the try so that its
		# IOEx and file errors are reported as ordinary error
		# messages instead of escaping as a traceback.
		# startswith() also copes with an empty DEV argument.
		if dev.startswith("/"):
			dev = resolve_fsname(dev)
			if verbose:
				sys.stdout.write("Displaying statistics for %s\n" % dev)
		statloop(every, dev, showheader, repmax)
		sys.exit(0)
	except IOEx:
		error(str(sys.exc_info()[1]))
		sys.exit(1)
	except EnvironmentError:
		error("System error: %s" % str(sys.exc_info()[1]))
		sys.exit(1)

if __name__ == "__main__":
	process(sys.argv[1:])
