1#/**
2# * © 2016 and later: Unicode, Inc. and others.
3# * License & terms of use: http://www.unicode.org/copyright.html#License
4# *******************************************************************************
5# * Copyright (C) 2002-2004, International Business Machines Corporation and    *
6# * others. All Rights Reserved.                                                *
7# *******************************************************************************
8# */
9package Dataset;
10use Statistics::Descriptive;
11use Statistics::Distributions;
12use strict;
13
# Create a new Dataset from a list of numeric samples.
#
# Usage:   my $ds = Dataset->new(@samples);
# Returns: a blessed hash with the fields
#   _data  - array ref holding a private copy of the samples
#   _scale - scaling factor, initially 1.0 (no scaling)
#   _mean  - unscaled sample mean; 0.0 when no samples are given
#   _error - unscaled error term; 0.0 when fewer than two samples are given
sub new {
    # Copy the samples out of @_.  The elements of @_ alias the caller's
    # variables, so storing \@_ would let later changes on the caller's
    # side silently alter this Dataset's data.
    my ($class, @samples) = @_;

    my $self = bless {
        _data  => \@samples,
        _scale => 1.0,
        _mean  => 0.0,
        _error => 0.0,
    }, $class;

    my $n = scalar @samples;

    # With no samples there is nothing to compute; keep the zero defaults.
    return $self if $n < 1;

    my $stats = Statistics::Descriptive::Full->new();
    $stats->add_data(@samples);
    $self->{_mean} = $stats->mean();

    if ($n >= 2) {
        # Use a t distribution rather than Gaussian because (a) we
        # assume an underlying normal dist, (b) we do not know the
        # standard deviation -- we estimate it from the data, and (c)
        # we MAY have a small sample size (also works for large n).
        # An upper-tail probability of 0.005 corresponds to a two-sided
        # 99% level, with $n-1 degrees of freedom.
        my $t = Statistics::Distributions::tdistr($n - 1, 0.005);
        $self->{_error} = $t * $stats->standard_deviation();
    }

    return $self;
}
43
# Replace the scaling factor applied to the reported mean and error.
# A factor of 1.0 means no scaling.  The factor must be > 0; this is
# the caller's responsibility and is not checked here.
# Returns the newly stored factor.
sub setScale {
    my $self   = shift;
    my $factor = shift;
    return $self->{_scale} = $factor;
}
50
# Multiply the current scaling factor by the given value, in place.
# Returns the updated factor.
sub scaleBy {
    my $self       = shift;
    my $multiplier = shift;
    return $self->{_scale} *= $multiplier;
}
56
# Return the mean, multiplied by the current scaling factor.
sub getMean {
    my ($self) = @_;
    my ($mean, $scale) = @{$self}{qw(_mean _scale)};
    return $mean * $scale;
}
62
# Return a 99% error based on the t distribution, multiplied by the
# current scaling factor.  The dataset is described as
# getMean() +/- getError().
sub getError {
    my ($self) = @_;
    my ($error, $scale) = @{$self}{qw(_error _scale)};
    return $error * $scale;
}
69
# Divide this Dataset by another and return the quotient as a new
# Dataset that carries only mean+/-error; it has no data points.
#
# The smallest ratio pairs the low end of the numerator interval with
# the high end of the denominator interval, and the largest ratio pairs
# the high end with the low end.  The result's mean is the midpoint of
# those two ratios and its error is the distance from that midpoint to
# the smallest ratio.  The result's scale is the quotient of the scales.
# NOTE(review): this pairing presumably assumes both intervals are
# strictly positive -- confirm against callers.
sub divide {
    my ($numer, $denom) = @_;

    # Unscaled interval endpoints of both operands.
    my $numer_lo = $numer->{_mean} - $numer->{_error};
    my $numer_hi = $numer->{_mean} + $numer->{_error};
    my $denom_lo = $denom->{_mean} - $denom->{_error};
    my $denom_hi = $denom->{_mean} + $denom->{_error};

    my $ratio_lo = $numer_lo / $denom_hi;
    my $ratio_hi = $numer_hi / $denom_lo;

    my $quotient = Dataset->new();
    my $midpoint = ($ratio_lo + $ratio_hi) / 2;
    $quotient->{_mean}  = $midpoint;
    $quotient->{_error} = $midpoint - $ratio_lo;
    $quotient->{_scale} = $numer->{_scale} / $denom->{_scale};

    return $quotient;
}
87
881;
89