Commit e4a20730 authored by Shane Snyder's avatar Shane Snyder
Browse files

forgot to add 3 files to darshan-test/2.x

parent 919fe4a9
#!/usr/bin/perl -w
#
# (C) 2010 by Argonne National Laboratory.
#
# Portions of this code including histogram package and routines for
# printing human readable sizes and percentages taken from fsstats 1.4.5
# Copyright (c) 2005 Panasas, Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
use Cwd;
use Getopt::Long;
use English;
# Input csv files named on the command line; filled in by process_args().
my @csv_files = ();
process_args();

# Summary counters accumulated over every input file.
my ($special_files, $skipped_hlink, $skipped_snapshot) = (0, 0, 0);
my ($total_cap_used, $total_size) = (0, 0);

# Symlink target statistics.  The *_pct values hold the percentages read
# from the file currently being merged; the plain counters are the weighted
# totals derived from them in add_histogram().
my ($slink_relative, $slink_absolute) = (0, 0);
my ($slink_relative_pct, $slink_absolute_pct) = (0, 0);

# Histogram parameter sets (the histogram layout is copied from the fsstats
# program).
#
# Be careful when choosing 'max' for a histogram:
#  - with log_incr set, the largest value+1 in the last bucket is a power
#    of 2;
#  - without log_incr, the largest value+1 in the last bucket is a multiple
#    of the 'max' value.
# To be able to toggle log_incr without changing anything else, pick a max
# value n such that n+1 is a power of 2 and also a multiple of 'incr'
# (hence 127 with incr 8 below, rather than the earlier value of 120).
# Otherwise results may look wrong.  Do not change the min values.
my @log_fractional = (min => 0, incr => 1, log_incr => 1, integer_vals => 0);
my @log_integer    = (min => 0, incr => 1, log_incr => 1);
my @linear_to_127  = (min => 0, max => 127, incr => 8);

# One histogram object per histogram section of an fsstats csv file.
my $size_histo        = Histo->new(@log_fractional);
my $cap_histo         = Histo->new(@log_fractional);
my $pos_ovhd_histo    = Histo->new(@log_fractional);
my $neg_ovhd_histo    = Histo->new(@log_fractional);
my $dir_histo         = Histo->new(@log_integer);
my $dirkb_histo       = Histo->new(@log_fractional);
my $fname_histo       = Histo->new(@linear_to_127);
my $slink_histo       = Histo->new(@linear_to_127);
my $hlink_histo       = Histo->new(@log_integer);
my $mtime_files_histo = Histo->new(@log_fractional);
my $mtime_bytes_histo = Histo->new(@log_fractional);
my $ctime_files_histo = Histo->new(@log_fractional);
my $ctime_bytes_histo = Histo->new(@log_fractional);
my $atime_files_histo = Histo->new(@log_fractional);
my $atime_bytes_histo = Histo->new(@log_fractional);
# Merge every csv file named on the command line into the running totals,
# then print the combined report.

# Histogram object paired with the section title it is stored under in the
# csv files.
my @histogram_sections = (
    [$size_histo,        "file size"],
    [$cap_histo,         "capacity used"],
    [$pos_ovhd_histo,    "positive overhead"],
    [$neg_ovhd_histo,    "negative overhead"],
    [$dir_histo,         "directory size (entries)"],
    [$dirkb_histo,       "directory size"],
    [$fname_histo,       "filename length"],
    [$hlink_histo,       "link count"],
    [$slink_histo,       "symlink target length"],
    [$mtime_files_histo, "mtime (files)"],
    [$mtime_bytes_histo, "mtime (KB)"],
    [$ctime_files_histo, "ctime (files)"],
    [$ctime_bytes_histo, "ctime (KB)"],
    [$atime_files_histo, "atime (files)"],
    [$atime_bytes_histo, "atime (KB)"],
);

foreach my $file ( @csv_files )
{
    # Three-argument open with an explicit read mode and a lexical handle:
    # the file name is taken literally, so surrounding whitespace or
    # characters such as ">" or "|" in a name given on the command line are
    # never interpreted as an open mode or a command to run.
    open(my $csv, '<', $file) || die("Error opening file $file: $!\n");

    # add_summary() must run first: it validates the file and records the
    # symlink percentages that add_histogram() needs for the
    # "symlink target length" section.
    add_summary($csv);

    foreach my $section ( @histogram_sections )
    {
        add_histogram($csv, $section->[0], $section->[1]);
    }

    close($csv);
}
print_output();
exit 0;
# print_output()
#
# Writes the merged statistics to stdout in the csv layout of fsstats 1.4.5:
# a comment header, the summary lines, then one section per histogram, with
# the symlink target percentages following the symlink histogram.
sub print_output
{
    # $fh is an undeclared package variable; this part of the script does
    # not run under "use strict".  ">-" opens standard output.
    open($fh, ">-") || die("Error opening output.\n");

    # comment header
    printf {$fh} "#Generated by fsstats-merge.pl (fsstats v1.4.5)\n";
    printf {$fh} "#Comment: This is a comment line that can be modified or repeated before\n";
    printf {$fh} "#uploading to record voluntarily added information.\n\n";

    # summary lines: format string plus the single value it prints
    my @summary_lines = (
        ["skipped special files,%d\n",       $special_files],
        ["skipped duplicate hardlinks,%d\n", $skipped_hlink],
        ["skipped snapshot dirs,%d\n",       $skipped_snapshot],
        ["total capacity used,%s\n",         kb_to_print($total_cap_used)],
        ["total user data,%s\n",             kb_to_print($total_size)],
        ["percent overhead,%f\n",            ovhd_pct($total_size, $total_cap_used)/100],
    );
    foreach my $entry (@summary_lines)
    {
        printf {$fh} $entry->[0], $entry->[1];
    }
    printf {$fh} "\n";

    # histograms up to and including the symlink target lengths
    my @leading_sections = (
        [$size_histo,     "file size",                "KB"],
        [$cap_histo,      "capacity used",            "KB"],
        [$pos_ovhd_histo, "positive overhead",        "KB"],
        [$neg_ovhd_histo, "negative overhead",        "KB"],
        [$dir_histo,      "directory size (entries)", "ents"],
        [$dirkb_histo,    "directory size",           "KB"],
        [$fname_histo,    "filename length",          "chars"],
        [$hlink_histo,    "link count",               "links"],
        [$slink_histo,    "symlink target length",    "chars"],
    );
    foreach my $section (@leading_sections)
    {
        my ($histo, $title, $unit) = @{$section};
        $histo->print_csv($fh, $title, $unit);
    }

    # Turn the weighted symlink counters back into fractions of all
    # symlinks; a zero counter is printed as 0 without dividing.
    my ($relative_share, $absolute_share) = (0, 0);
    if($slink_relative)
    {
        $relative_share = $slink_relative / $slink_histo->{count};
    }
    if($slink_absolute)
    {
        $absolute_share = $slink_absolute / $slink_histo->{count};
    }
    printf {$fh} "relative symlink target pct,%f\n" .
        "absolute symlink target pct,%f\n",
        $relative_share,
        $absolute_share;

    # timestamp histograms
    my @time_sections = (
        [$mtime_files_histo, "mtime (files)"],
        [$mtime_bytes_histo, "mtime (KB)"],
        [$ctime_files_histo, "ctime (files)"],
        [$ctime_bytes_histo, "ctime (KB)"],
        [$atime_files_histo, "atime (files)"],
        [$atime_bytes_histo, "atime (KB)"],
    );
    foreach my $section (@time_sections)
    {
        $section->[0]->print_csv($fh, $section->[1], "days");
    }
}
# process_args()
#
# Parses the command line.  Prints the help text and exits with status 0
# when --help is given.  Prints an error plus the help text and exits with
# status 1 when an option cannot be parsed or fewer than two file names
# remain.  On success the remaining arguments are stored in @csv_files.
sub process_args
{
    # GetOptions("help") is called without an explicit destination, so
    # Getopt::Long stores the flag in the package variable $opt_help.
    our $opt_help;

    Getopt::Long::Configure("no_ignore_case", "bundling");

    # GetOptions returns false when it met an option it could not parse
    # (it has already warned about it on stderr).
    my $options_ok = GetOptions( "help" );

    if($opt_help)
    {
        print_help();
        exit(0);
    }

    # Refuse to continue after a bad option, and require at least two
    # remaining arguments (file names).
    if(!$options_ok || $#ARGV < 1)
    {
        print "Error: invalid arguments.\n";
        print_help();
        exit(1);
    }

    @csv_files = @ARGV;
    return;
}
# print_help()
#
# Prints the usage message for this script to stdout.
sub print_help
{
    my @usage_lines = (
        "Usage: $PROGRAM_NAME file file [file ...]",
        "--help Prints this help message",
        "Purpose:",
        "This script reads multiple csv files generated by fsstats 1.4.5 and",
        "merges the statistics into a single csv file which is printed to stdout.",
    );

    # emit the whole message with a single print call
    print join("", map { "$_\n" } @usage_lines);
    return;
}
# add_summary()
#
# Scans the whole csv file behind the given handle and folds its summary
# lines into the running totals: skipped special files, skipped duplicate
# hardlinks, skipped snapshot dirs, total capacity used and total user data.
# Dies unless the file carries a "Generated by ... fsstats v1.4.5" line.
# The handle is rewound before and after the scan.
#
sub add_summary
{
    my ($file) = @_;
    my $found_version = "";

    seek($file, 0, 0);
    while(my $row = <$file>)
    {
        $found_version = $1 if $row =~ /Generated by.*fsstats v([\d\.]+)/;

        $special_files    += $1 if $row =~ /^skipped special files,(.*)/;
        $skipped_hlink    += $1 if $row =~ /^skipped duplicate hardlinks,(.*)/;
        $skipped_snapshot += $1 if $row =~ /^skipped snapshot dirs,(.*)/;

        # sizes are stored in printable form ("<number> <unit>"); sum in KB
        $total_cap_used += print_to_kb($1) if $row =~ /^total capacity used,(.*)/;
        $total_size     += print_to_kb($1) if $row =~ /^total user data,(.*)/;

        # The symlink percentages are remembered, not summed: they have to
        # be weighted by this file's symlink count, which add_histogram()
        # does when it merges the "symlink target length" histogram.
        $slink_relative_pct = $1 if $row =~ /^relative symlink target pct,(.*)/;
        $slink_absolute_pct = $1 if $row =~ /^absolute symlink target pct,(.*)/;
    }

    die("Error: csv file not generated by fsstats v1.4.5.\n")
        if $found_version ne "1.4.5";

    seek($file, 0, 0);
}
# add_histogram()
#
# Finds the named histogram section in a csv file and sums it into an
# existing histogram object.
#
# Arguments:
#   file handle - open, seekable handle for the csv file; it is rewound
#                 before and after the scan
#   histogram   - object to accumulate into (modified in place); it must
#                 provide add(value, count) and the min_val/max_val fields
#   name        - histogram title as written after "histogram," in the file
#
# The individual data points are no longer known, so every non-empty bucket
# row is re-added as <count> samples of (6th field / 3rd field) -- presumably
# the bucket's average value; confirm against the csv writer.  Because those
# averages are not the true extremes, min_val/max_val are corrected
# afterwards from the "min" and "max" lines of the section.
#
# Dies if the section contains a line that is not recognised.
sub add_histogram
{
    my ($file, $histo, $title) = @_;
    my $name = quotemeta($title);   # title made safe for use in a regex

    # Extremes before this file is merged; add() overwrites them with
    # bucket averages, so they are restored/merged by hand below.
    my $old_min_val = $histo->{min_val};
    my $old_max_val = $histo->{max_val};

    my $min = 0;
    my $max = 0;
    my $total_count = 0;    # samples contributed by this file

    seek($file, 0, 0);
    while(my $line = <$file>)
    {
        # look for specified histogram
        next unless $line =~ /^histogram,$name$/;

        while($line = <$file>)
        {
            if($line =~ /^count,(.*),/) {}      # not needed; buckets are summed
            elsif($line =~ /^average,(.*),/) {} # not needed; recomputed from buckets
            elsif($line =~ /^min,(.*),/) {$min = $1;}
            elsif($line =~ /^max,(.*),/) {$max = $1;}
            elsif($line =~ /^bucket min,bucket max,/) {} # key
            elsif($line =~
                /^([0-9]*\.?[0-9]*),([0-9]*\.?\-?[0-9]*),([0-9]*\.?[0-9]*),([0-9]*\.?[0-9]*),([0-9]*\.?[0-9]*),([0-9]*\.?[0-9]*),([0-9]*\.?[0-9]*),([0-9]*\.?[0-9]*)/)
            {
                my ($bucket_count, $bucket_total) = ($3, $6);
                if ($bucket_count > 0)
                {
                    $histo->add(($bucket_total/$bucket_count), $bucket_count);
                    $total_count += $bucket_count;
                }
            }
            elsif($line =~ /^\s*$/)
            {
                # stop when we hit a blank line
                last;
            }
            else
            {
                print $line;
                die("Error: poorly formated csv file.\n");
            }
        }
        last;
    }

    # work backwards to relative and absolute slink counters
    if($name eq quotemeta("symlink target length"))
    {
        $slink_relative += $slink_relative_pct * $total_count;
        $slink_absolute += $slink_absolute_pct * $total_count;
    }

    # Fix min and max value.  A file that contributed no samples must not
    # change extremes that are already known; otherwise keep the smaller
    # minimum and the larger maximum.
    my $has_samples = ($total_count > 0);

    if(defined $old_min_val && (!$has_samples || $old_min_val < $min))
    {$histo->{min_val} = $old_min_val;}
    else
    {$histo->{min_val} = $min;}

    if(defined $old_max_val && (!$has_samples || $old_max_val > $max))
    {$histo->{max_val} = $old_max_val;}
    else
    {$histo->{max_val} = $max;}

    seek($file, 0, 0);
}
# Convert a printable size ("<number> <unit>", unit one of TB, GB, MB or KB)
# back to a number of KB.  Any other unit yields 0.
sub print_to_kb {
    my ($arg) = @_;
    my ($value, $unit) = split(/ /, $arg);

    return $value * (1024*1024*1024) if $unit eq "TB";
    return $value * (1024*1024)      if $unit eq "GB";
    return $value * (1024)           if $unit eq "MB";
    return $value                    if $unit eq "KB";

    # unrecognized unit
    return 0;
}
# The routines in this section were taken directly from fsstats 1.4.5 by
# Marc Unangst <munangst@panasas.com> and Shobhit Dayal
# <sdayal@andrew.cmu.edu>
#######################################################################
# Percent overhead for a given size and capacity used: the share of the
# capacity used that is not user data.  0% means no overhead; 100% means
# size==0 with cap>0, i.e. the space is all overhead.  A capacity of 0
# yields 0.
sub ovhd_pct {
    my ($size, $cap) = @_;

    # nothing allocated: avoid dividing by zero
    return 0 if $cap == 0;

    my $overhead = $cap - $size;
    return ($overhead/$cap)*100;
}
# Convert a KB value to a "printable" string with two decimals and a unit
# (TB, GB, MB or KB) chosen by magnitude.  A unit is used only when the
# value is strictly greater than one of that unit.
sub kb_to_print {
    my ($kb) = @_;

    # largest unit first; each entry is [unit label, size of the unit in KB]
    my @units = (
        ["TB", 1024*1024*1024],
        ["GB", 1024*1024],
        ["MB", 1024],
    );

    foreach my $unit (@units) {
        my ($label, $kb_per_unit) = @{$unit};
        if($kb > $kb_per_unit) {
            return sprintf("%.2f %s", $kb / $kb_per_unit, $label);
        }
    }

    return sprintf("%.2f %s", $kb, "KB");
}
##### Histo.pm #####
#
# Histo.pm
#
# Histogram module for Perl.
#
# Author: Marc Unangst <munangst@panasas.com>
#
# Copyright (c) 2005 Panasas, Inc. All rights reserved.
#
use strict;
package Histo;
#
# Constructor for a new Histo object.  The arguments are a hash of
# parameter/value pairs:
#
#   min          - required; lower bound of the first bucket
#   incr         - required; bucket width for linear histograms
#   max          - optional; values above it are counted as "over max"
#   log_incr     - optional; when true the buckets grow logarithmically
#                  and the first bucket ends at min + log_incr
#   integer_vals - optional; defaults to 1
#
# Dies when min or incr is missing.
#
sub new {
    my ($type, %params) = @_;

    die "Histo->new: required parameters not set\n"
        unless (defined $params{min} && defined $params{incr});

    my $self = {
        min          => $params{min},
        incr         => $params{incr},
        integer_vals => (defined $params{integer_vals}) ? $params{integer_vals} : 1,
        count        => 0,
        total_val    => 0,
        log_incr     => $params{log_incr} ? $params{log_incr} : 0,
        buckets      => [],
        buckets_val  => [],
    };

    # max is stored exactly as given and only when it was supplied
    $self->{max} = $params{max} if defined $params{max};

    # logarithmic histograms keep the upper bound of each bucket;
    # add() extends this list on demand
    if($self->{log_incr}) {
        $self->{bucket_max} = [$self->{min}+$self->{log_incr}];
    }

    return bless $self, $type;
}
#
# Add a new data point to the histogram.
#
# @arg $val
#   Value to add to the histogram
# @arg $count
#   Optional; if specified, the weight of the item being added.
#   Calling add($x, 3) is the same as calling add($x) three times.
#
# Values above {max} (when a max is set) or below {min} are only tallied in
# the over_max/under_min counters; everything else lands in a bucket.
#
sub add ($$;$) {
    my ($self, $val, $count) = @_;
    $count = 1 if !defined $count;

    # track the extremes seen so far
    $self->{min_val} = $val
        if !defined $self->{min_val} || $val < $self->{min_val};
    $self->{max_val} = $val
        if !defined $self->{max_val} || $val > $self->{max_val};

    my $weighted_val = $val*$count;
    $self->{count} += $count;
    $self->{total_val} += $weighted_val;

    if(defined $self->{max} && $val > $self->{max}) {
        $self->{over_max} += $count;
        $self->{over_max_buckets_val} += $weighted_val;
    }
    elsif($val < $self->{min}) {
        $self->{under_min} += $count;
        $self->{under_min_buckets_val} += $weighted_val;
    }
    else {
        my $slot;
        if($self->{log_incr}) {
            # Walk the logarithmic buckets until one can hold the value.
            # Each upper bound is twice the previous one plus one; bounds
            # not recorded yet are appended to {bucket_max} on the way.
            $slot = 0;
            my $upper = $self->{bucket_max}[0];
            while($val >= $upper+1) {
                $upper = $upper*2 + 1;
                $slot++;
                $self->{bucket_max}[$slot] = $upper
                    if $slot > $#{$self->{bucket_max}};
            }
        }
        else {
            # linear buckets of width {incr}, starting at {min}
            $slot = int (($val - $self->{min}) / $self->{incr});
        }

        $self->{buckets}[$slot] += $count;
        $self->{buckets_val}[$slot] += $weighted_val;

        # remember the fullest bucket
        if(!defined $self->{largest_bucket} ||
           $self->{buckets}[$slot] > $self->{largest_bucket}) {
            $self->{largest_bucket} = $self->{buckets}[$slot];
        }
    }
}
#
# Get maximum value of the specified bucket.
#
# @arg $b
#   bucket number
#
# @return
#   Logarithmic histograms: the recorded upper bound of the bucket, or
#   undef when no bound has been recorded for that bucket yet.
#   Linear histograms: min + incr*(bucket+1) - 1, i.e. one less than the
#   minimum of the following bucket, so the result agrees with
#   _get_bucket_min() and with the bucket index computed in add() for any
#   min, not only for min == 0.
#
# @internal
#
sub _get_bucket_max ($$) {
    my ($self, $bucket) = @_;

    if($self->{log_incr}) {
        if($bucket <= $#{$self->{bucket_max}}) {
            return ($self->{bucket_max}[$bucket]);
        }
        # Explicit undef (not a bare return) so callers that use the
        # result inside an argument list still receive one value.
        return undef;
    }

    # Linear buckets start at {min}; the offset has to be included here.
    return ($self->{min} + ($self->{incr}*($bucket+1)) - 1);
}
#
# Get minimum value of the specified bucket.
#
# @arg $b
#   bucket number
#
# @return
#   Linear histograms: min + incr*bucket.
#   Logarithmic histograms: min for bucket 0, otherwise one more than the
#   recorded upper bound of the previous bucket; undef when no bound has
#   been recorded for the requested bucket.
#
# @internal
#
sub _get_bucket_min ($$) {
    my ($self, $index) = @_;

    # linear buckets
    return ($self->{min} + $self->{incr}*($index))
        unless $self->{log_incr};

    # logarithmic buckets
    return $self->{min} if $index == 0;
    return $self->{bucket_max}[$index-1]+1
        if $index <= $#{$self->{bucket_max}};
    return undef;
}
#
# Print the histogram contents to STDOUT.
#
# @arg $prefix
# String to prefix each output line with.
# @arg $unit_str
# String that describes the units of the histogram items.
#
sub print ($$$) {
my $self = shift;
my ($prefix, $unit_str) = @_;
my $c = 0;
my $d = 0;
# my $prev_pct = 0;
my $width;
my $fmt;
# if ($self->{integer_vals}) {
$width = length sprintf("%d", $self->_get_bucket_max($#{$self->{buckets}}));
$fmt = "d";
# }
# else {
# $width = length sprintf("%.1f", $self->_get_bucket_max($#{$self->{buckets}}));
# $fmt = ".1f"
# }
my $bwidth = 0;
if (defined $self->{largest_bucket}) {
$bwidth = length sprintf("%d", $self->{largest_bucket});
}
if($bwidth < 5) {
$bwidth = 5;
}
my $bwidth_val = length sprintf("%.2f", $self->{total_val});
printf("%scount=%d avg=%.2f %s\n", $prefix,
$self->{count},