1#/**
2# *******************************************************************************
3# * Copyright (C) 2002-2004, International Business Machines Corporation and    *
4# * others. All Rights Reserved.                                                *
5# *******************************************************************************
6# */
7package Dataset;
8use Statistics::Descriptive;
9use Statistics::Distributions;
10use strict;
11
# Create a new Dataset from a list of numeric samples.
#
# Usage:   my $ds = Dataset->new(@samples);
#
# The samples are copied into the object, so later changes to the
# caller's variables do not affect the Dataset.
#
# The object starts with a scale of 1.0, a mean of 0.0 and an error of
# 0.0.  With at least one sample the mean is computed; with at least two
# samples the error is computed as well.  An empty Dataset keeps the
# defaults.
#
# Returns the new blessed object.
sub new {
    # Copy the arguments instead of keeping a reference to @_: the
    # elements of @_ are aliases of the caller's variables, so a stored
    # \@_ would change whenever the caller modified its own array.
    my ($class, @samples) = @_;

    my $self = bless {
        _data  => \@samples,
        _scale => 1.0,
        _mean  => 0.0,
        _error => 0.0,
    }, $class;

    my $n = scalar @samples;
    return $self if $n < 1;

    my $stats = Statistics::Descriptive::Full->new();
    $stats->add_data(@samples);
    $self->{_mean} = $stats->mean();

    if ($n >= 2) {
        # Use a t distribution rather than Gaussian because (a) we
        # assume an underlying normal dist, (b) we do not know the
        # standard deviation -- we estimate it from the data, and (c)
        # we MAY have a small sample size (also works for large n).
        #
        # 0.005 is the upper-tail probability, i.e. the critical value
        # for a two-sided 99% interval with n-1 degrees of freedom.
        my $t = Statistics::Distributions::tdistr($n - 1, 0.005);

        # NOTE(review): this multiplies t by the sample standard
        # deviation, not by the standard error of the mean
        # (s / sqrt(n)) -- confirm which interval is intended.
        $self->{_error} = $t * $stats->standard_deviation();
    }

    return $self;
}
41
# Replace the scaling factor applied by getMean() and getError().
# A factor of 1.0 leaves the values unscaled.  The factor is expected
# to be > 0; this is not checked here.
# Returns the newly stored factor.
sub setScale {
    my $self = shift;
    my ($factor) = @_;
    return $self->{_scale} = $factor;
}
48
# Multiply the current scaling factor by the given value, in place.
# Returns the updated factor.
sub scaleBy {
    my $self = shift;
    my ($multiplier) = @_;
    return $self->{_scale} *= $multiplier;
}
54
# Return the mean with the current scaling factor applied.
sub getMean {
    my ($self) = @_;
    my ($mean, $scale) = @{$self}{qw(_mean _scale)};
    return $mean * $scale;
}
60
# Return the 99% error (derived from the t distribution) with the
# current scaling factor applied.  The dataset is described as
# getMean() +/- getError().
sub getError {
    my ($self) = @_;
    my ($error, $scale) = @{$self}{qw(_error _scale)};
    return $error * $scale;
}
67
# Divide this Dataset by another one and return the quotient as a new
# Dataset that carries a mean+/-error but no data points.
#
# The smallest ratio is (mean - error) of this Dataset over
# (mean + error) of $rhs; the largest ratio is (mean + error) over
# (mean - error).  The result's mean is the midpoint of those two
# ratios and its error is the distance from the midpoint to the
# smallest ratio.  The result's scale is this scale divided by the
# scale of $rhs.
#
# Perl raises a division-by-zero error if either denominator
# (rhs mean + error, rhs mean - error) or the rhs scale is 0.
sub divide {
    my ($self, $rhs) = @_;

    my ($num_mean, $num_err) = @{$self}{qw(_mean _error)};
    my ($den_mean, $den_err) = @{$rhs}{qw(_mean _error)};

    my $lowest  = ($num_mean - $num_err) / ($den_mean + $den_err);
    my $highest = ($num_mean + $num_err) / ($den_mean - $den_err);
    my $midpoint = ($lowest + $highest) / 2;

    my $quotient = Dataset->new();
    $quotient->{_mean}  = $midpoint;
    $quotient->{_error} = $midpoint - $lowest;
    $quotient->{_scale} = $self->{_scale} / $rhs->{_scale};
    return $quotient;
}
85
861;
87