#!/usr/bin/perl -w
#
#  (C) 2010 by Argonne National Laboratory.
#
# Routines for handling human readable sizes and percentages, histogram
# package
# TODO: what else?
# Copyright (c) 2005 Panasas, Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA

use Cwd;
use Getopt::Long;
use English;

# List of csv files named on the command line; filled in by process_args().
my @csv_files = ();

process_args();

# Summary counters, all starting at zero.
my ($special_files, $skipped_hlink, $skipped_snapshot) = (0, 0, 0);
my ($total_cap_used, $total_size)                      = (0, 0);

# Constructor arguments shared by the log-scaled histograms below (same
# settings as the fsstats program uses).  The second list additionally
# switches off integer-valued bucket formatting.
my @log_scaled      = (min => 0, incr => 1, log_incr => 1);
my @log_scaled_frac = (@log_scaled, integer_vals => 0);

my $size_histo     = Histo->new(@log_scaled_frac);
my $cap_histo      = Histo->new(@log_scaled_frac);
my $pos_ovhd_histo = Histo->new(@log_scaled_frac);
my $neg_ovhd_histo = Histo->new(@log_scaled_frac);
my $dir_histo      = Histo->new(@log_scaled);
my $dirkb_histo    = Histo->new(@log_scaled_frac);

# The remaining histograms are only declared here; they are constructed
# after the notes on choosing 'max' that follow.
my ($fname_histo, $slink_histo, $hlink_histo,
    $mtime_files_histo, $mtime_bytes_histo,
    $ctime_files_histo, $ctime_bytes_histo,
    $atime_files_histo, $atime_bytes_histo);

#Be careful in choosing the value of 'max' for histos.
#If the log_incr is set for them, the largest value+1 in the last bucket will be a power of 2. #If it is not set the largest value+1 in the last bucket will be some multiple of the 'max' value. #To be able to toggle log_incr off and on without having to change anything else 'max' should be #chosen carefully. Otherwise results may look wrong. #Hence if the max value chosen is 'n', then n+1 should be a power of 2. And n+1 should also be a #multiple of the 'incr' value. #Dont play with min values. # $fname_histo = Histo->new(min => 0, max => 120, incr => 8); $fname_histo = Histo->new(min => 0, max => 127, incr => 8); # $slink_histo = Histo->new(min => 0, max => 120, incr => 8); $slink_histo = Histo->new(min => 0, max => 127, incr => 8); $hlink_histo = Histo->new(min => 0, incr => 1, log_incr => 1); $mtime_files_histo = Histo->new(min => 0, incr => 1, log_incr => 1, integer_vals => 0); $mtime_bytes_histo = Histo->new(min => 0, incr => 1, log_incr => 1, integer_vals => 0); $ctime_files_histo = Histo->new(min => 0, incr => 1, log_incr => 1, integer_vals => 0); $ctime_bytes_histo = Histo->new(min => 0, incr => 1, log_incr => 1, integer_vals => 0); $atime_files_histo = Histo->new(min => 0, incr => 1, log_incr => 1, integer_vals => 0); $atime_bytes_histo = Histo->new(min => 0, incr => 1, log_incr => 1, integer_vals => 0); # loop through specified csv files foreach my $file ( @csv_files ) { open(CSV, "$file") || die("Error opening file $file: $!\n"); add_summary(CSV); add_histogram(CSV, $size_histo, "file size"); close(CSV); } print_output(); exit 0; sub print_output { printf("#Generated by fsstats-merge.pl (fsstats v1.4.5)\n"); printf("#Comment: This is a comment line that can be modified or repeated before\n"); printf("#uploading to record voluntarily added information.\n\n"); printf("skipped special files,%d\n", $special_files); printf("skipped duplicate hardlinks,%d\n", $skipped_hlink); printf("skipped snapshot dirs,%d\n", $skipped_snapshot); printf("total capacity 
used,%s\n", kb_to_print($total_cap_used)); printf("total user data,%s\n", kb_to_print($total_size)); printf("percent overhead,%f\n", ovhd_pct($total_size, $total_cap_used)/100); } sub process_args { use vars qw( $opt_help ); Getopt::Long::Configure("no_ignore_case", "bundling"); GetOptions( "help" ); if($opt_help) { print_help(); exit(0); } # there should be at least two remaining arguments (file names) if($#ARGV < 1) { print "Error: invalid arguments.\n"; print_help(); exit(1); } @csv_files = @ARGV; return; } sub print_help { print <) { if ($line =~ /Generated by fsstats v(.*) /) { $version = $1; } if ($line =~ /^skipped special files,(.*)/) {$special_files += $1;} if ($line =~ /^skipped duplicate hardlinks,(.*)/) {$skipped_hlink += $1;} if ($line =~ /^skipped snapshot dirs,(.*)/) {$skipped_snapshot += $1;} if ($line =~ /^total capacity used,(.*)/) {$total_cap_used += print_to_kb($1);} if ($line =~ /^total user data,(.*)/) {$total_size += print_to_kb($1);} } if($version ne "1.4.5") { die("Error: csv file not generated by fsstats v1.4.5.\n"); } seek($file, 0, 0); } # add_histogram() # # finds the specified histogram in a csv file and sums it to the existing # histogram data structure sub add_histogram { my $file = $_[0]; # open file handle # $_[1] is the data structure we are adding to my $name = $_[2]; # name of the histogram # save some global values; we need to replace/adjust those manually # after adding in the histogram because we no longer know the actual # value of each data point my $old_count = $_[1]->{count}; my $old_total_val = $_[1]->{total_val}; my $old_min_val = $_[1]->{min_val}; my $old_max_val = $_[1]->{max_val}; my $count = 0; my $average = 0; my $min = 0; my $max = 0; seek($file, 0, 0); while($line = <$file>) { # look for specified histogram if($line =~ /^histogram,$name$/) { while($line = <$file>) { if($line =~ /^count,(.*),/) {$count = $1;} elsif($line =~ /^average,(.*),/) {$average = $1;} elsif($line =~ /^min,(.*),/) {$min = $1;} elsif($line =~ 
/^max,(.*),/) {$max = $1;} elsif($line =~ /^bucket min,bucket max,/) {} # key elsif($line =~ /^([0-9]*\.?[0-9]*),([0-9]*\.?[0-9]*),([0-9]*\.?[0-9]*),([0-9]*\.?[0-9]*),([0-9]*\.?[0-9]*),([0-9]*\.?[0-9]*),([0-9]*\.?[0-9]*),([0-9]*\.?[0-9]*)/) { $_[1]->add(($6/$3), $3); } elsif($line =~ /^\s*$/) { # stop when we hit a blank line last; } else { print $line; die("Error: poorly formated csv file.\n"); } } last; } } # TODO: fix min/max seek($file, 0, 0); } # convert a printable value to KB sub print_to_kb { my ($arg) = @_; my ($value, $unit) = split(/ /, $arg); my $num = 0; if($unit eq "TB") { $num = $value * (1024*1024*1024); } elsif($unit eq "GB") { $num = $value * (1024*1024); } elsif($unit eq "MB") { $num = $value * (1024); } elsif($unit eq "KB") { $num = $value; } return $num; } # The routines in this section were taken directly from fsstats 1.4.5 by # Marc Unangst and Shobhit Dayal # ####################################################################### # Compute the percent overhead for a given capacity-used and size. # This method of computing overhead computes "percentage of the # capacity used that is overhead" and ranges from 0% (no overhead) to # 100% (size==0 and cap>0, space is all overhead). sub ovhd_pct { my ($size, $cap) = @_; if ($cap == 0) { return 0; } return (($cap - $size)/$cap)*100; } # convert a KB value to a "printable" value (GB, MB, or KB) depending # on its magnitude. returns a string suitable for printing. sub kb_to_print { my ($kb) = @_; my $num; my $unit; if($kb > 1024*1024*1024) { $num = $kb / (1024*1024*1024); $unit = "TB"; } elsif($kb > 1024*1024) { $num = $kb / (1024*1024); $unit = "GB"; } elsif($kb > 1024) { $num = $kb / 1024; $unit = "MB"; } else { $num = $kb; $unit = "KB"; } return sprintf("%.2f %s", $num, $unit); } ##### Histo.pm ##### # # Histo.pm # # Histogram module for Perl. # # Author: Marc Unangst # # Copyright (c) 2005 Panasas, Inc. All rights reserved. # use strict; package Histo; # # Constructor for a new Histo object. 
# The arguments are a hash
# of parameter/value pairs.  The "min" and "incr" parameters
# must be supplied.  "max", "log_incr" and "integer_vals" are optional.
#
# @arg min           lower bound of the first bucket (required)
# @arg incr          bucket width for linear histograms (required)
# @arg max           optional upper bound; stored exactly as given
# @arg log_incr      optional; a true value selects log-scaled buckets and
#                    is used as the width of the first bucket
# @arg integer_vals  optional, defaults to 1
#
# @return the new object, blessed into the invoking class
#
sub new {
    my ($type, %params) = @_;

    defined $params{min} && defined $params{incr}
        or die "Histo->new: required parameters not set\n";

    my $self = {
        min          => $params{min},
        incr         => $params{incr},
        integer_vals => (defined $params{integer_vals} ? $params{integer_vals} : 1),
        count        => 0,
        total_val    => 0,
        log_incr     => ($params{log_incr} ? $params{log_incr} : 0),
        buckets      => [],
        buckets_val  => [],
    };

    # "max" is only recorded when the caller supplied one.
    $self->{max} = $params{max} if defined $params{max};

    # Log-scaled histograms keep a list of bucket upper bounds; seed it
    # with the upper bound of bucket 0.
    $self->{bucket_max} = [ $self->{min} + $self->{log_incr} ]
        if $self->{log_incr};

    return bless $self, $type;
}

#
# Add a new data point to the histogram.
#
# @arg $val
#   Value to add to the histogram
# @arg $count
#   Optional; if specified, the weight of the item being added.
#   Calling add($x, 3) is the same as calling add($x) three times.
#
# Updates the running count/total and the observed min/max, then files the
# sample either into the under-min / over-max tallies or into a bucket.
# Log-scaled histograms grow their list of bucket upper bounds on demand.
#
sub add ($$;$) {
    my $self = shift;
    my ($val, $count) = @_;

    # An omitted weight means a single sample.
    if (!defined $count) {
        $count = 1;
    }

    # Track the smallest and largest raw values seen so far.
    if (!defined $self->{min_val} || $val < $self->{min_val}) {
        $self->{min_val} = $val;
    }
    if (!defined $self->{max_val} || $val > $self->{max_val}) {
        $self->{max_val} = $val;
    }

    $self->{count}     += $count;
    $self->{total_val} += ($val * $count);

    if (defined $self->{max} && $val > $self->{max}) {
        # Above the configured maximum: tallied separately, not bucketed.
        $self->{over_max}             += $count;
        $self->{over_max_buckets_val} += $val * $count;
    }
    elsif ($val < $self->{min}) {
        # Below the configured minimum: tallied separately, not bucketed.
        $self->{under_min}             += $count;
        $self->{under_min_buckets_val} += $val * $count;
    }
    else {
        my $bucket;

        if ($self->{log_incr}) {
            # Walk the doubling upper bounds (x -> 2x+1) until one covers
            # the value, recording any bound not seen before.
            $bucket = 0;
            my $upper = $self->{bucket_max}[0];
            while ($val >= $upper + 1) {
                $upper = $upper * 2 + 1;
                $bucket++;
                if ($bucket > $#{ $self->{bucket_max} }) {
                    $self->{bucket_max}[$bucket] = $upper;
                }
            }
        }
        else {
            # Fixed-width buckets, counted from the configured minimum.
            $bucket = int(($val - $self->{min}) / $self->{incr});
        }

        $self->{buckets}[$bucket]     += $count;
        $self->{buckets_val}[$bucket] += $val * $count;

        # Remember the fullest bucket; print() uses it to size a column.
        if (!defined $self->{largest_bucket}
            || $self->{buckets}[$bucket] > $self->{largest_bucket}) {
            $self->{largest_bucket} = $self->{buckets}[$bucket];
        }
    }
}

#
# Get maximum value of the specified bucket.
#
# @arg $b
#   bucket number
#
# @return
#   Largest value belonging to the bucket.  For log-scaled histograms this
#   is undef when the bucket has no recorded upper bound yet.
#
# @internal
#
sub _get_bucket_max ($$) {
    my $self = shift;
    my ($bucket) = @_;

    if ($self->{log_incr}) {
        if ($bucket <= $#{ $self->{bucket_max} }) {
            return ($self->{bucket_max}[$bucket]);
        }
        # Deliberately an explicit undef rather than a bare return: the
        # result is used inside printf argument lists, where an empty list
        # would shift every following argument.
        return undef;
    }

    # add() places a value in bucket int((val - min) / incr), so the upper
    # bound has to be offset by min as well, matching _get_bucket_min().
    return ($self->{min} + $self->{incr} * ($bucket + 1) - 1);
}

#
# Get minimum value of the specified bucket.
#
# @arg $b
#   bucket number
#
# @return
#   Smallest value belonging to the bucket, or undef for a log-scaled
#   bucket that has no recorded bounds.
#
# @internal
#
sub _get_bucket_min ($$) {
    my $self = shift;
    my ($b) = @_;

    if ($self->{log_incr}) {
        if ($b == 0) {
            return $self->{min};
        }
        elsif ($b <= $#{$self->{bucket_max}}) {
            # A log bucket starts just above the previous bucket's maximum.
            return $self->{bucket_max}[$b-1]+1;
        }
        else {
            # Explicit undef: the result is used in printf argument lists.
            return undef;
        }
    }
    else {
        return ($self->{min} + $self->{incr}*($b));
    }
}

#
# Divide $part by $whole, yielding 0 instead of dying when $whole is zero.
# The value total of a histogram is legitimately zero when every sample is
# zero, or when samples above max and below min cancel each other out.
#
# @internal
#
sub _fraction {
    my ($part, $whole) = @_;
    return $whole ? $part / $whole : 0;
}

#
# Print the histogram contents to STDOUT.
#
# Output is a count/average line, a min/max line, then one row per
# non-empty bucket, preceded by an under-min row and followed by an
# over-max row when such samples exist.
#
# @arg $prefix
#   String to prefix each output line with.
# @arg $unit_str
#   String that describes the units of the histogram items.
#
sub print ($$$) {
    my $self = shift;
    my ($prefix, $unit_str) = @_;
    my $c = 0;    # cumulative item count
    my $d = 0;    # cumulative value total

    # Bucket bounds are always printed as integers; the column is as wide
    # as the upper bound of the last bucket.
    my $width = length sprintf("%d", $self->_get_bucket_max($#{$self->{buckets}}));
    my $fmt = "d";

    # The count column fits the fullest bucket, with a minimum of 5.
    my $bwidth = 0;
    if (defined $self->{largest_bucket}) {
        $bwidth = length sprintf("%d", $self->{largest_bucket});
    }
    if ($bwidth < 5) {
        $bwidth = 5;
    }
    my $bwidth_val = length sprintf("%.2f", $self->{total_val});

    printf("%scount=%d avg=%.2f %s\n", $prefix, $self->{count},
           $self->{count} > 0 ? $self->{total_val} / $self->{count} : 0,
           $unit_str);

    my ($min_val, $max_val);
    $min_val = defined $self->{min_val} ? $self->{min_val} : "0";
    $max_val = defined $self->{max_val} ? $self->{max_val} : "0";
    printf("%smin=%.2f %s max=%.2f %s\n", $prefix, $min_val, $unit_str, $max_val, $unit_str);

    if (defined $self->{under_min} && $self->{under_min} > 0) {
        $c += $self->{under_min};
        $d += $self->{under_min_buckets_val};
        # Value percentages go through _fraction() so a zero total prints
        # 0% instead of aborting, as the bucket rows below already do.
        my $d_pct = _fraction($d, $self->{total_val}) * 100;
        printf("%s[%${width}s<%${width}${fmt} %s]: %${bwidth}d (%5.2f%%) (%6.2f%% cumulative) %${bwidth_val}.2f %s (%5.2f%%) (%6.2f%% cumulative)\n",
               $prefix, " ", $self->{min}, $unit_str,
               $c, ($c/$self->{count})*100, ($c/$self->{count})*100,
               $d, $unit_str, $d_pct, $d_pct);
    }

    for (my $b = 0; $b <= $#{$self->{buckets}}; $b++) {
        if ($self->{buckets}->[$b]) {
            my $x = $self->{buckets}->[$b];
            my $y = $self->{buckets_val}->[$b];
            $c += $x;
            $d += $y;
            my $pct = ($x / $self->{count}) * 100;
            my $cum_pct = ($c / $self->{count}) * 100;
            # if all the files parsed are zero bytes, the total_val will be
            # zero but count will be a positive number
            my $y_pct = _fraction($y, $self->{total_val}) * 100;
            my $y_cum_pct = _fraction($d, $self->{total_val}) * 100;

            if ($self->{integer_vals}) {
                # Closed interval [min-max].
                printf("%s[%${width}${fmt}-%${width}${fmt} %s]: %${bwidth}d (%5.2f%%) (%6.2f%% cumulative) %${bwidth_val}.2f %s (%5.2f%%) (%6.2f%% cumulative) \n",
                       $prefix, $self->_get_bucket_min($b), $self->_get_bucket_max($b), $unit_str,
                       $x, $pct, $cum_pct, $y, $unit_str, $y_pct, $y_cum_pct);
            }
            else {
                # Half-open interval [min-max+1) for non-integer values.
                printf("%s[%${width}${fmt}-%${width}${fmt} %s): %${bwidth}d (%5.2f%%) (%6.2f%% cumulative) %${bwidth_val}.2f %s (%5.2f%%) (%6.2f%% cumulative)\n",
                       $prefix, $self->_get_bucket_min($b), $self->_get_bucket_max($b)+1, $unit_str,
                       $x, $pct, $cum_pct, $y, $unit_str, $y_pct, $y_cum_pct);
            }
        }
    }

    if (defined $self->{over_max} && $self->{over_max} > 0) {
        $c += $self->{over_max};
        $d += $self->{over_max_buckets_val};
        printf("%s[%${width}s>%${width}${fmt} %s]: %${bwidth}d (%5.2f%%) (%6.2f%% cumulative) %${bwidth_val}.2f %s (%5.2f%%) (%6.2f%% cumulative)\n",
               $prefix, " ", $self->{max}, $unit_str,
               $self->{over_max}, ($self->{over_max} / $self->{count})*100, ($c / $self->{count})*100,
               $self->{over_max_buckets_val}, $unit_str,
               _fraction($self->{over_max_buckets_val}, $self->{total_val}) * 100,
               _fraction($d, $self->{total_val}) * 100);
    }
}

#
# Print histogram contents to a CSV-format file.
#
# Writes a "histogram,<name>" header, the count/average/min/max lines, a
# column-title line, one row per non-empty bucket and a terminating blank
# line.  Samples below min are reported in a row whose bucket min is -1;
# samples above max in a row whose bucket max is -1.  Percent columns are
# written as fractions (not multiplied by 100).
#
# @arg $fh
#   filehandle to print to
# @arg $name
#   descriptive name of this histogram, to identify it in the file
# @arg $unit_str
#   string that describes the units of the histogram items
#
sub print_csv {
    my $self = shift;
    my ($fh, $name, $unit_str) = @_;
    my $c = 0;    # cumulative item count
    my $d = 0;    # cumulative value total

    printf {$fh} "histogram,%s\n", $name;
    printf {$fh} "count,%d,items\n", $self->{count};
    printf {$fh} "average,%f,%s\n",
        $self->{count} > 0 ? $self->{total_val} / $self->{count} : 0,
        $unit_str;

    my ($min_val, $max_val);
    $min_val = defined $self->{min_val} ? $self->{min_val} : "0";
    $max_val = defined $self->{max_val} ? $self->{max_val} : "0";
    printf {$fh} "min,%d,%s\n", $min_val, $unit_str;
    printf {$fh} "max,%d,%s\n", $max_val, $unit_str;

    print {$fh} "bucket min,bucket max,count,percent,cumulative pct,val count,percent,cumulative pct\n";

    if (defined $self->{under_min} && $self->{under_min} > 0) {
        $c += $self->{under_min};
        $d += $self->{under_min_buckets_val};
        # Guarded division: the value total may be zero.
        my $d_frac = _fraction($d, $self->{total_val});
        printf {$fh} "%d,%d,%d,%f,%f,%f,%f,%f\n",
            -1, $self->{min},
            $c, $c/$self->{count}, $c/$self->{count},
            $d, $d_frac, $d_frac;
    }

    for (my $b = 0; $b <= $#{$self->{buckets}}; $b++) {
        if (defined $self->{buckets}->[$b] && $self->{buckets}->[$b] != 0) {
            my $x = $self->{buckets}->[$b];
            my $y = $self->{buckets_val}->[$b];
            $c += $x;
            $d += $y;
            my $pct = $x / $self->{count};
            my $cum_pct = $c / $self->{count};
            # if all the files parsed are zero bytes, the total_val will be
            # zero but count will be a positive number
            my $y_pct = _fraction($y, $self->{total_val});
            my $y_cum_pct = _fraction($d, $self->{total_val});

            # Non-integer histograms report an exclusive upper bound.
            my $upper = $self->{integer_vals}
                ? $self->_get_bucket_max($b)
                : $self->_get_bucket_max($b)+1;
            printf {$fh} "%d,%d,%d,%f,%f,%f,%f,%f\n",
                $self->_get_bucket_min($b), $upper,
                $x, $pct, $cum_pct, $y, $y_pct, $y_cum_pct;
        }
    }

    if (defined $self->{over_max} && $self->{over_max} > 0) {
        $c += $self->{over_max};
        $d += $self->{over_max_buckets_val};
        printf {$fh} "%d,%d,%d,%f,%f,%f,%f,%f\n",
            $self->{max}, -1,
            $self->{over_max}, $self->{over_max}/$self->{count}, $c/$self->{count},
            $self->{over_max_buckets_val},
            _fraction($self->{over_max_buckets_val}, $self->{total_val}),
            _fraction($d, $self->{total_val});
    }

    print {$fh} "\n";
}

__END__
#######################################################################