Commit 48771c04 authored by Swann Perarnau's avatar Swann Perarnau

Import coolr files into project, link to sensor

This is a partial import of github.com/coolr-hpc/pycoolr from
master branch, commit id: 67e7aa4b89b67744922b5926cd1459adf650013b

Coolr will provide us the capability to read power, topology and
msr-based sensors.

This patch links the sensor module with the coolr code as it is, which
doesn't really work. The core issue is that the current coolr code is
meant to be stand-alone, and the main functions do both the sampling and
the formatting of the data in json, by hand.

We will solve that in the next commit, removing the json-specific code
from the sample function to create a dict of values instead.
parent c0b82b60
#!/usr/bin/env python
#
# coolr cpufreq related codes
#
# There is no perfect way to read the CPU clock on x86. We need to
# read TSC, APERF and MPERF to estimate the current cpu clock on x86.
#
# This code requires the cpustat driver
#
# Contact: Kazutomo Yoshii <ky@anl.gov>
#
import os, sys, time
import struct
import copy
import clr_nodeinfo
import numpy as np
# an example of the content of cpustat
#id 0
#aperf 4926926121023
#mperf 4582847073452
#perf_bias 8
#ucc 281462841145699
#urc 0
#perf_target 8448
#perf_status 8448
#pstate 33
#turbo_disengage 0
#tsc 1117245755950154
class cpustatvals:
    """One snapshot of the per-CPU counters exposed by the cpustat driver.

    parse() fills self.d with the wall-clock time of the read plus the
    counters listed in _fields. Two snapshots of the same CPU can then be
    combined with calc_cpufreq() / calc_aperf().
    """

    # counters copied out of the cpustat file; every other line is ignored
    _fields = ('id', 'aperf', 'mperf', 'pstate', 'tsc')

    def cpustatfn(self, cpuid):
        """Return the path of the cpustat file for the given cpu id."""
        return "/sys/devices/system/cpu/cpu%d/cpustat/cpustat" % cpuid

    def __init__(self, cpuid):
        self.u64max = (1 << 64) - 1
        self.d = {}
        self.cpuid = cpuid

    def parse(self):
        """Re-read the cpustat file of this CPU into self.d.

        Raises whatever open() raises if the file cannot be opened, and
        ValueError if a selected counter is not an integer.
        """
        self.d = {}  # clear d's contents
        self.d['time'] = time.time()
        with open(self.cpustatfn(self.cpuid)) as f:
            for line in f:
                a = line.split()
                # skip blank lines and lines without a value
                if len(a) >= 2 and a[0] in self._fields:
                    self.d[a[0]] = int(a[1])

    def pr(self):
        """Print the selected counters on a single line."""
        print(' '.join('%s=%d ' % (k, self.d[k]) for k in self._fields))

    def diff_u64(self, v1, v2):
        """Return v1 - v2 for 64-bit counters, allowing one wraparound."""
        # modulo 2**64: a counter that wrapped from u64max to 0 advanced by 1
        return (v1 - v2) & self.u64max

    def _deltas(self, prev, keys):
        """Return {key: float delta since prev}, or None if a key is missing."""
        for k in tuple(keys) + ('time',):
            if k not in prev.d or k not in self.d:
                return None
        return dict((k, float(self.diff_u64(self.d[k], prev.d[k])))
                    for k in keys)

    def calc_cpufreq(self, prev):
        """Estimate the clock in GHz between prev and this snapshot.

        prev is an older cpustatvals object of the same CPU. Returns 0.0
        when either snapshot is incomplete, when no time has elapsed or
        when mperf did not advance (the ratio would be undefined).
        """
        delta = self._deltas(prev, ('tsc', 'aperf', 'mperf'))
        if delta is None:
            return 0.0
        dt = self.d['time'] - prev.d['time']
        if dt <= 0.0 or delta['mperf'] == 0.0:
            return 0.0
        freq = delta['aperf'] / delta['mperf']
        freq *= delta['tsc']
        freq *= 1e-9  # convert it to GHz
        freq /= dt
        return freq

    def calc_aperf(self, prev):
        """Return the aperf delta per second, scaled by 1e-9.

        prev is an older cpustatvals object of the same CPU. Returns 0.0
        when either snapshot is incomplete or no time has elapsed.
        """
        delta = self._deltas(prev, ('tsc', 'aperf'))
        if delta is None:
            return 0.0
        dt = self.d['time'] - prev.d['time']
        if dt <= 0.0:
            return 0.0
        return delta['aperf'] * 1e-9 / dt
class cpufreq_reader:
    """Sample the cpustat counters of every online CPU.

    Two sets of snapshots are kept and overwritten alternately, so the
    most recent and the previous sample are always available. self.init
    is False when the cpustat file of any online CPU is missing; in that
    case every query returns zeros (or '' for sample_and_json).
    """

    def __init__(self):
        self.outputpercore(True)

        # I don't know how to create an object in a singleton manner in
        # python, so simply instantiating an object of cputopology again here.
        self.ct = clr_nodeinfo.cputopology()
        self.cpus = self.ct.onlinecpus  # just for convenience

        self.init = False
        self.cnt = 0
        self.samples = [{}, {}]
        for cpuid in self.cpus:
            tmp = cpustatvals(cpuid)  # just for cpustatfn
            if not os.path.exists(tmp.cpustatfn(cpuid)):
                return
        self.init = True

        # keyed by cpuid so that gaps in the online cpu list are harmless
        self.samples = [
            dict((i, cpustatvals(i)) for i in self.cpus),
            dict((i, cpustatvals(i)) for i in self.cpus)]

        self.sample()

    def _zeros(self):
        """Return a list of 0.0 that can be indexed by any online cpuid."""
        if not self.cpus:
            return []
        return [0.0] * (max(self.cpus) + 1)

    def _cur_prev(self):
        """Return (current, previous) snapshot dicts; needs cnt >= 2."""
        return self.samples[(self.cnt - 1) % 2], self.samples[self.cnt % 2]

    def sample(self):
        """Read the counters of every online CPU into the older slot."""
        if not self.init:
            return
        slot = self.samples[self.cnt % 2]
        for cpuid in self.cpus:
            slot[cpuid].parse()
        self.cnt += 1

    def pstate(self):
        """Return the latest pstate values as a list indexed by cpuid."""
        ret = self._zeros()
        if not self.init or self.cnt == 0:
            return ret
        latest = self.samples[(self.cnt - 1) % 2]
        for cpuid in self.cpus:
            ret[cpuid] = latest[cpuid].d['pstate']
        return ret

    def cpufreq(self):
        """Return calc_cpufreq() of the last two samples, indexed by cpuid.

        All entries are 0.0 until two samples have been taken.
        """
        ret = self._zeros()
        if not self.init or self.cnt < 2:
            return ret
        cur, prev = self._cur_prev()
        for cpuid in self.cpus:
            ret[cpuid] = cur[cpuid].calc_cpufreq(prev[cpuid])
        return ret

    def aperf(self):
        """Return calc_aperf() of the last two samples, indexed by cpuid.

        All entries are 0.0 until two samples have been taken.
        """
        ret = self._zeros()
        if not self.init or self.cnt < 2:
            return ret
        cur, prev = self._cur_prev()
        for cpuid in self.cpus:
            ret[cpuid] = cur[cpuid].calc_aperf(prev[cpuid])
        return ret

    def outputpercore(self, flag=True):
        """Select whether sample_and_json() emits per-core values."""
        self.percore = flag

    def sample_and_json(self, node=""):
        """Take a sample and return the aperf values as a JSON string.

        For every package the mean and standard deviation over its CPUs
        are emitted, followed by the per-core values when enabled through
        outputpercore(). Returns '' when the reader is not initialized.
        """
        if not self.init:
            return ''

        self.sample()
        f = self.aperf()
        parts = ['{"sample":"freq", "time":%.3f' % time.time()]
        if len(node) > 0:
            parts.append(',"node":"%s"' % node)
        for p in sorted(self.ct.pkgcpus.keys()):
            cpus = self.ct.pkgcpus[p]
            vals = [f[i] for i in cpus]
            parts.append(',"p%s":{' % p)
            parts.append('"mean":%.3lf,"std":%.3lf'
                         % (np.mean(vals), np.std(vals)))
            if self.percore:
                parts.extend(',"c%d":%.3lf' % (c, f[c]) for c in cpus)
            parts.append('}')
        parts.append('}')
        return ''.join(parts)
if __name__ == '__main__':
    # Stand-alone check: print the JSON sample once per second, 20 times.
    freq = cpufreq_reader()

    if not freq.init:
        print('Please check the cpustat module is installed')
        sys.exit(1)

    freq.outputpercore(False)

    for _ in range(20):
        report = freq.sample_and_json()
        print('[freq json]')
        print(report)
        time.sleep(1)

    sys.exit(0)

    # NOTE: never reached because of the sys.exit(0) above; kept as the
    # verbose variant that also dumps pstate, aperf and the estimated clock.
    for _ in range(20):
        freq.sample()

        print(' '.join(['[pstate]'] + [str(v) for v in freq.pstate()]))
        print(' '.join(['[aperf]'] + ['%.2lf ' % v for v in freq.aperf()]))
        print(' '.join(['[freq]'] + ['%.2lf ' % v for v in freq.cpufreq()]))

        report = freq.sample_and_json()
        print('[freq json]')
        print(report)
        print('')

        time.sleep(1)
#!/usr/bin/env python
#
# coolr hwmon related codes
#
# This code requires the coretemp driver for temperature reading
#
# Contact: Kazutomo Yoshii <ky@anl.gov>
#
import re, os, sys
import numpy as np
from clr_nodeinfo import *
class coretemp_reader:
    """Locate the coretemp hwmon entries and read their temperatures.

    After construction self.coretemp maps a package id to a coretempinfo
    object holding the temp*_input file names of that package.
    """

    def _parse_label(self, fn, pattern):
        """Return the integer captured by pattern in the first line of fn.

        Returns -1 when the file cannot be opened or does not match.
        """
        try:
            with open(fn, "r") as f:
                line = f.readline()
        except (IOError, OSError):
            return -1
        m = re.search(pattern, line)
        if m:
            return int(m.group(1))
        return -1

    def parse_pkgtemp(self, fn):
        """Return the id of a 'Physical id N' label file, or -1."""
        return self._parse_label(fn, 'Physical id ([0-9]+)')

    def parse_coretemp(self, fn):
        """Return the id of a 'Core N' label file, or -1."""
        return self._parse_label(fn, 'Core ([0-9]+)')

    hwmondir = '/sys/class/hwmon/'

    class coretempinfo:
        def __init__(self):
            self.dir = ''
            self.coretempfns = {}  # use coreid as key
            self.pkgtempfn = ''

    def __init__(self):
        self.outputpercore(True)

        self.coretemp = {}  # use pkgid as key
        for d1 in os.listdir(self.hwmondir):
            # try to check see if 'name' contains 'coretemp'
            tmpdir = "%s%s" % (self.hwmondir, d1)
            drivername = readbuf("%s/name" % tmpdir).rstrip()
            if drivername != "coretemp":
                continue

            pkgid = -1
            coretempfns = {}
            pkgtempfn = ''
            # parse all temp*_label files
            for d2 in os.listdir(tmpdir):
                m = re.search('temp([0-9]+)_label', d2)
                if not m:
                    continue
                tempid = int(m.group(1))
                labelfn = "%s/%s" % (tmpdir, d2)
                inputfn = "%s/temp%d_input" % (tmpdir, tempid)
                coreid = self.parse_coretemp(labelfn)
                if coreid >= 0:
                    coretempfns[coreid] = inputfn
                else:  # possibly pkgid
                    pkgtempfn = inputfn
                    pkgid = self.parse_pkgtemp(labelfn)
                    if pkgid < 0:
                        print('unlikely:  %s' % pkgtempfn)

            cti = self.coretempinfo()
            cti.dir = tmpdir
            cti.coretempfns = coretempfns
            cti.pkgtempfn = pkgtempfn

            if pkgid < 0:  # assume a single socket machine
                self.coretemp[0] = cti
            else:
                self.coretemp[pkgid] = cti

    def _read_temp(self, fn):
        """Return the value in fn divided by 1000 (integer), or None.

        None is returned when the file is not readable or does not hold
        an integer, so that such a sensor is simply left out.
        """
        if not os.access(fn, os.R_OK):
            return None
        try:
            return int(readbuf(fn)) // 1000
        except ValueError:
            return None

    def readtempall(self):
        """Return {pkgid: {'pkg': temp, coreid: temp, ...}} for all packages.

        Sensors that cannot be read are omitted from the inner dict.
        """
        ret = {}
        for pkgid in sorted(self.coretemp.keys()):
            info = self.coretemp[pkgid]
            temps = {}
            val = self._read_temp(info.pkgtempfn)
            if val is not None:
                temps['pkg'] = val
            for c in sorted(info.coretempfns.keys()):
                val = self._read_temp(info.coretempfns[c])
                if val is not None:
                    temps[c] = val
            ret[pkgid] = temps
        return ret

    def outputpercore(self, flag=True):
        """Select whether sample_and_json() emits per-sensor values."""
        self.percore = flag

    def sample_and_json(self, node=""):
        """Read all temperatures and return them as a JSON string."""
        temp = self.readtempall()
        # constructing a json output
        s = '{"sample":"temp","time":%.3f' \
            % (time.time())
        if len(node) > 0:
            s += ',"node":"%s"' % node
        for p in sorted(temp.keys()):
            s += ',"p%d":{' % p

            pstat = self.getpkgstats(temp, p)

            s += '"mean":%.2lf ' % pstat[0]
            s += ',"std":%.2lf ' % pstat[1]
            s += ',"min":%.2lf ' % pstat[2]
            s += ',"max":%.2lf ' % pstat[3]

            if self.percore:
                # integer core ids first, then 'pkg'; an explicit key keeps
                # this order where int and str cannot be compared
                keys = sorted(temp[p].keys(),
                              key=lambda k: (isinstance(k, str), k))
                for c in keys:
                    s += ',"%s":%d' % (c, temp[p][c])
            s += '}'
        s += '}'

        return s

    def getmaxcoretemp(self, temps):
        """Return the highest value found in a readtempall() result."""
        vals = []
        for pkgid in temps.keys():
            vals.extend(temps[pkgid].values())
        return np.max(vals)

    def getpkgstats(self, temps, pkgid):
        """Return [mean, std, min, max] over all values of one package."""
        vals = list(temps[pkgid].values())
        return [np.mean(vals), np.std(vals), np.min(vals), np.max(vals)]

    def readpkgtemp(self, pkgid=0):
        """Return the package sensor value of pkgid divided by 1000.0.

        Raises KeyError for an unknown pkgid and whatever open() raises
        when the sensor file cannot be opened.
        """
        with open(self.coretemp[pkgid].pkgtempfn) as f:
            return int(f.readline()) / 1000.0

    def readcoretemp(self, pkgid):
        """Return the core sensor values of pkgid, each divided by 1000.0.

        The list is ordered by core id; unreadable sensors are skipped.
        """
        t = []
        corefns = self.coretemp[pkgid].coretempfns
        for c in sorted(corefns.keys()):
            fn = corefns[c]
            if not os.access(fn, os.R_OK):
                continue  # cpu may become offline
            with open(fn) as f:
                t.append(int(f.readline()) / 1000.0)
        return t
class acpi_power_meter_reader:
    """Read the average power reported through an hwmon power1_average file.

    fn overrides the default sysfs path; the reader is only considered
    initialized when that file exists at construction time.
    """

    # add a nicer detection routine later
    powerfn = '/sys/class/hwmon/hwmon0/device/power1_average'

    def __init__(self, fn=None):
        self.fn = self.powerfn if fn is None else fn
        self.init = os.path.exists(self.fn)

    def initialized(self):
        """Return True when the power file was found by the constructor."""
        return self.init

    def read(self):
        """Return the current reading scaled by 1e-6, or -1 on any failure."""
        if not self.init:
            return -1

        try:
            with open(self.fn, "r") as f:
                line = f.readline()
        except (IOError, OSError):
            return -1

        try:
            return int(line) * 1e-6  # uW to W
        except ValueError:
            # empty or malformed content is reported like a failed read
            return -1

    def sample_and_json(self, node=""):
        """Return the current reading as a JSON string, '' if unavailable."""
        if not self.init:
            return ''

        pwr = self.read()
        buf = '{"sample":"acpi", "time":%.3f' % time.time()
        if len(node) > 0:
            buf += ',"node":"%s"' % node
        buf += ',"power":%.2lf}' % pwr

        return buf
if __name__ == '__main__':
    # Stand-alone check of the ACPI power meter and the coretemp reader.
    acpipwr = acpi_power_meter_reader()
    if acpipwr.initialized():
        print(acpipwr.sample_and_json('testnode'))

    ctr = coretemp_reader()
    ctr.outputpercore(False)

    temp = ctr.readtempall()
    for p in sorted(temp.keys()):
        fields = ['pkg%d:' % p]
        for c in sorted(temp[p].keys()):
            fields.append("%s=%d " % (c, temp[p][c]))
        print(' '.join(fields))

    for _ in range(3):
        print(ctr.sample_and_json())
        time.sleep(1)

    # measure the time to read all temp
    # note: reading temp on other core triggers an IPI,
    # so reading temp frequently will increase the CPU load
    print('Measuring readtime() and getmaxcoretemp ...')
    for _ in range(3):
        started = time.time()
        temp = ctr.readtempall()
        maxcoretemp = ctr.getmaxcoretemp(temp)
        finished = time.time()

        fields = [' %.1f msec, maxcoretemp=%d'
                  % ((finished - started) * 1000.0, maxcoretemp)]
        for p in sorted(temp.keys()):
            st = ctr.getpkgstats(temp, p)
            fields.append(' mean=%.2lf std=%.2lf min=%.1lf max=%.1lf'
                          % (st[0], st[1], st[2], st[3]))
        print(' '.join(fields))

        time.sleep(1)
    print('')
#!/usr/bin/env python
#
# misc. classes, functions
#
# Contact: Kazutomo Yoshii <ky@anl.gov>
#
import os, sys, re, time
def readbuf(fn):
    """Return the first line of file fn, or '' if it cannot be read.

    The read is attempted up to 10 times, sleeping 10 ms after every
    failure. Only I/O errors are retried; anything else (for instance
    KeyboardInterrupt) propagates to the caller.
    """
    for _ in range(10):
        try:
            with open(fn) as f:
                return f.readline()
        except EnvironmentError:  # IOError / OSError
            time.sleep(0.01)
    return ''
def readuptime():
    """Return the first field of /proc/uptime as a float.

    Raises whatever open() raises when /proc/uptime is not available.
    """
    # the with block closes the file, which the plain open() never did
    with open('/proc/uptime') as f:
        fields = f.readline().split()
    return float(fields[0])
#!/usr/bin/env python
#
# CPU related codes
#
# Contact: Kazutomo Yoshii <ky@anl.gov>
#
import os, sys, re, time, socket
# local
from clr_misc import *
#
# Once instantiated, the following values are available
#
# onlinecpus : a list holds all online cpus
# pkgcpus : a dict holds per pkg cpus. the key of the dict are pkgids
# nodecpus : a dict holds per node cpus. the key of the dict are nodeids
#
# limitation: no support for runtime change
#
class cputopology:
    """CPU, package and NUMA-node layout read from sysfs.

    detect() (called by the constructor) fills in:
      onlinecpus  : list of online cpu ids
      pkgcpus     : {pkgid: [cpuid, ...]}
      cpu2coreid  : {cpuid: (pkgid, coreid)}
      core2cpuid  : {(pkgid, coreid): cpuid}; when several cpus share a
                    core, the one visited last is kept
      onlinenodes : list of online node ids
      nodecpus    : {nodeid: [cpuid, ...]}
    """

    cpubasedir = '/sys/devices/system/cpu/'
    nodebasedir = '/sys/devices/system/node/'

    def parserange(self, fn):
        """Expand a range list file such as '0-3,8' into a list of ints.

        Prints a message and exits the process with status 1 when an
        item holds neither one nor two numbers (this includes an empty
        or unreadable file).
        """
        ret = []
        for t in readbuf(fn).split(','):
            ab = re.findall('[0-9]+', t)
            if len(ab) == 2:
                ret.extend(range(int(ab[0]), int(ab[1]) + 1))
            elif len(ab) == 1:
                ret.append(int(ab[0]))
            else:
                print('unlikely at cputoplogy.parserange(): %s' % (ab,))
                sys.exit(1)
        return ret

    def parsemask(self, fn):
        """Return the bit positions set in a comma-separated hex mask file.

        The rightmost group holds bits 0-31, the next one bits 32-63 and
        so on. Raises ValueError when a group is not valid hex.
        """
        groups = readbuf(fn).rstrip().split(',')
        shift = 0
        ret = []
        for mstr in reversed(groups):
            bits = int(mstr, 16)
            ret.extend(i + shift for i in range(32) if (bits >> i) & 1)
            shift += 32
        return ret

    def detect(self):
        """(Re)read the topology from sysfs into the instance attributes."""
        self.onlinecpus = self.parserange(self.cpubasedir + 'online')

        self.pkgcpus = {}
        for cpuid in self.onlinecpus:
            pkgidfn = self.cpubasedir + "cpu%d/topology/physical_package_id" % (cpuid)
            pkgid = int(readbuf(pkgidfn))
            self.pkgcpus.setdefault(pkgid, []).append(cpuid)

        self.cpu2coreid = {}
        self.core2cpuid = {}
        for pkgid in self.pkgcpus.keys():
            for cpuid in self.pkgcpus[pkgid]:
                coreidfn = self.cpubasedir + "cpu%d/topology/core_id" % (cpuid)
                coreid = int(readbuf(coreidfn))
                self.cpu2coreid[cpuid] = (pkgid, coreid)
                self.core2cpuid[(pkgid, coreid)] = cpuid

        self.onlinenodes = self.parserange(self.nodebasedir + 'online')
        self.nodecpus = {}
        for n in self.onlinenodes:
            self.nodecpus[n] = self.parsemask(self.nodebasedir + "node%d/cpumap" % (n))

    def __init__(self):
        self.detect()
class nodeconfig:
    """Static description of the node, gathered once at construction.

    Always set: hostname, nodename, version, cpumodel, memoryKB, driver,
    freqdriver, governor, cur_freq, pstate, policy. The last four stay ''
    unless the corresponding sysfs directory exists.
    """

    def parse(self):
        """Read the node description from /proc and /sys."""
        self.hostname = socket.gethostname()
        # XXX: not sure this is unique
        self.nodename = self.hostname.split('.')[0]

        # third word of /proc/version; '' when it could not be read
        fields = readbuf('/proc/version').split()
        self.version = fields[2] if len(fields) > 2 else ''

        re_model = re.compile(r"^model\s+:\s+([0-9]+)")
        self.cpumodel = -1
        with open('/proc/cpuinfo') as f:
            for line in f:
                m = re_model.match(line)
                if m:
                    # the value of the last matching line wins
                    self.cpumodel = int(m.group(1))

        self.memoryKB = -1
        with open('/proc/meminfo') as f:
            # second field of the first line of /proc/meminfo
            self.memoryKB = int(f.readline().split()[1])

        # assume that all cpu have the same setting for this experiment
        self.driver = ''
        self.freqdriver = ''
        # defaults so that readers never hit AttributeError when the
        # matching sysfs directory below is absent
        self.governor = ''
        self.cur_freq = ''
        self.pstate = ''
        self.policy = ''

        d = '/sys/devices/system/cpu/cpu0/cpufreq'
        if os.path.exists(d):
            self.freqdriver = 'acpi_cpufreq'
            self.driver = readbuf(d + "/scaling_driver").rstrip()
            self.governor = readbuf(d + "/scaling_governor").rstrip()
            self.cur_freq = readbuf(d + "/scaling_cur_freq").rstrip()

        d = "/sys/devices/system/cpu/intel_pstate"
        if os.path.exists(d):
            self.freqdriver = 'pstate'
            pmax = readbuf("%s/%s" % (d, 'max_perf_pct')).rstrip()
            pmin = readbuf("%s/%s" % (d, 'min_perf_pct')).rstrip()
            noturbo = readbuf("%s/%s" % (d, 'no_turbo')).rstrip()
            self.pstate = "%s/%s/%s" % (pmax, pmin, noturbo)

        d = "/sys/devices/system/cpu/turbofreq"
        if os.path.exists(d):
            self.freqdriver = 'coolrfreq'
            self.policy = d + '/pstate_policy'

    def __init__(self):
        self.parse()
def testnodeconfig():
    """Print the main nodeconfig attributes of this machine."""
    print('===  %s' % sys._getframe().f_code.co_name)

    nc = nodeconfig()
    report = (
        ('node', nc.nodename),
        ('version', nc.version),
        ('cpumodel', nc.cpumodel),
        ('memoryKB', nc.memoryKB),
        ('freqdriver', nc.freqdriver),
    )
    for label, value in report:
        print('%s:  %s' % (label, value))
    print('')
def testcputopology():
    """Print the cpus of every package and node found by cputopology."""
    print('===  %s' % sys._getframe().f_code.co_name)

    ct = cputopology()
    print('')
    print('No. of online cpus:  %s' % len(ct.onlinecpus))
    print('')

    for p in sorted(ct.pkgcpus.keys()):
        members = ct.pkgcpus[p]
        print('pkg%d: %s %s' % (p, len(members), members))
        row = [' cpuid:']
        for cpuid in members:
            where = ct.cpu2coreid[cpuid]
            row.append(str(where))
            row.append(str(where[1]))
        print(' '.join(row))
    print('')

    for n in sorted(ct.nodecpus.keys()):
        members = ct.nodecpus[n]
        print('node%d: %s %s' % (n, len(members), members))
        row = [' cpuid:']
        for cpuid in members:
            row.append(str(ct.cpu2coreid[cpuid]))
        print(' '.join(row))
    print('')
if __name__ == '__main__':
    # run the self-checks in order: node description, then cpu topology
    for selfcheck in (testnodeconfig, testcputopology):
        selfcheck()
#!/usr/bin/env python
#