1#!/usr/localbin/perl
2
3use Getopt::Long;
4use File::Basename;
5use lib dirname($0);
6
# ---------------------------------------------------------------------
# Command-line handling and global setup.
#
# Options:
#   --add=FIELD   extra output column (may be repeated)
#   --invocab     treat every token as in-vocabulary
#   --quiet       accepted; only referenced by a commented-out message
#   --semantic    also accept a token whose top meaning equals the
#                 reference semantics (adds a "parsed_ortho" column)
#   --altsem=FILE load reference semantics from FILE
#
# Everything except @fields is deliberately left as a package global:
# the subroutines in this file read these names directly.
# ---------------------------------------------------------------------
$assume_invocab = 0; # _when_semantics_missing
$rc = GetOptions("add=s"    => \@additional_fields,
                 "invocab"  => \$assume_invocab,
                 "quiet"    => \$quiet,
                 "semantic" => \$try_semantic_validation,
                 "altsem=s" => \$altsemfile,
                 );

# Output columns, in the order they are written to every .utd record.
my @fields = ("file", "correct", "invocab", "gdiff", "sd", "sd13", "spf", "abs", "gdiffpf", "rejrslt", "rankc", "match", "ortho", "choice1", "choice2", "score1", "conf", "gender");

if($try_semantic_validation) {
    push(@additional_fields, "parsed_ortho");
}
push(@fields, @additional_fields);

# Lookup table so the per-token code can test for optional columns by name.
$additional_fieldh{$_}++ foreach @additional_fields;

load_altsemfile($altsemfile) if($altsemfile);

$| = 1;   # flush STDOUT after every write

# Input files are either named directly on the command line, or listed
# one per line in a file given as "@listfile".
if(@ARGV && $ARGV[0] =~ /^@/) {
    $flist = substr($ARGV[0], 1);
    # Read the list ourselves instead of shelling out to `cat`, so that
    # the file name is never interpreted by a shell.
    open(my $flist_fh, "<", $flist) || die "error opening $flist\n";
    @resfiles = <$flist_fh>;
    close($flist_fh);
    s/\s+$// foreach @resfiles;
    # Blank lines in the list would otherwise become empty file names.
    @resfiles = grep { length } @resfiles;
} else {
    @resfiles = @ARGV;
}
36
# Main loop: convert every result file into a colon-separated ".utd" table
# (a header line followed by one record per token) and print a one-line
# recognition-rate summary for that file on STDOUT.
foreach $resfile (@resfiles) {
    # Output name: the input name with its alphabetic extension replaced.
    ($base = $resfile) =~ s/\.[a-z]+$//i;
    $utdfile = "$base.utd";

    # print "processing $resfile to $utdfile\n" unless($quiet);
    # Bareword handles are kept on purpose: the dump_* helpers are handed
    # a glob reference to UTD.
    open(RES, "<", $resfile) || die "error opening $resfile\n";
    open(UTD, ">", $utdfile) || die "error opening $utdfile\n";
    $hUTD = \*UTD;
    undef %results;
    while(<RES>) {
        s/\s+$//;
        s/^\s+//;
        if(/^D:\s+(\S+)\s*$/) { # same as CREC
            # A "D:" line starts a new token: flush the pending one, or
            # write the header when nothing has been collected yet.
            # (%token in boolean context replaces "defined %token", which
            # is a fatal error on modern perls.)
            $file = $1;
            if(%token) {
                process(\%token, \%results);
                dump_record($hUTD, \%token);
            } else {
                dump_header($hUTD);
            }
            undef %token;
            $token{file} = $file;
            # Only look at $1 when the pattern really matched; otherwise
            # it would still hold the capture of an earlier match.
            $token{gender} = ($file =~ /ENU-(\d\d\d)-/) ? $gender{$1} : undef;
            $token{"snr"} = get_snr($file) if($additional_fieldh{"snr"});
            $token{"snrr"} = sprintf("%.2d",int(get_snr($file)/5+0.5)*5) if($additional_fieldh{"snrr"});
        } elsif(/^C:\s+(.*)$/) { # same as CREC
            $token{ortho} = normalize($1);
        } elsif(/^\s*(\S+) = (.*)$/) {
            # Generic "key = value" line; only "feedback" is used.
            ($augkey,$augval) = ($1,$2);
            if($augkey eq "feedback") {
                $token{parsed_ortho} = $augval;
                $token{invocab}++;
            }
        } elsif(/^R:\s+(.*)$/) { # same as CREC
            if(/<rejected/i || /<FAILED/i) {
                $token{rejrslt} = "f";
            } else {
                # $token{topchoice} = $1;
                $token{rejrslt} = "a";
            }
        } elsif(/^Sem[^:]+:  invocab=(\d)/) { # same as CREC
            # NOTE(review): the captured digit is ignored and invocab is
            # set to 1 regardless -- confirm this is intended.
            $token{invocab} = 1;
        } elsif(/^CSem:\s+([a-z]+.*)\s*$/i) {
            $token{parsed_ortho} = $1;
        } elsif(/^Sem:(\s+)(\S+)/) { # same as CREC
            $token{invocab} = 0;
        } elsif(/^LITERAL\[\s*0\]\s*:\s*\'(.*)\'/) {
            $token{choices}[0] = $1;
        } elsif(/^LITERAL\[\s*(\d+)\]\s+:\s+\'(.*)\'/) {
            $i = $1;
            $choice = $2;
            # Prefer the text after the last ": '" as before, but keep the
            # outer capture when that stricter pattern does not match
            # (previously a stale $1, the index, was stored instead).
            $choice = $1 if(/.*\: \'(.*)\'/);
            $token{choices}[$i] = $choice;
        } elsif(/^MEANING\[\s*(\d+)\]\s+:\s+\'(.*)\'/) {
            $i = $1;
            $choice = $2;
            # Same fallback rule as for LITERAL lines above.
            $choice = $1 if(/.*\: \'(.*)\'/);
            $choice =~ s/\s+$//;
            $token{meanings}[$i] = $choice;
        } elsif(/^LITERAL\[(\d+)\]\[(\d+)\]\s+:\s+\'(.*)\'/) {
            # Two-index form: the second bracketed number is stored as
            # the score of choice $i.
            $i = $1;
            $score = $2;
            $token{scores}[$i] = $score;
        } elsif(/^RAW SCORE\s+:\s+\'(.*)\'/) {
            $token{topscore} = $1;
        } elsif(/^gdiff\s+(.*)$/) {
            $token{gdiff} = $1;
        } elsif(/^sd13\s+(.*)$/) {
            $token{sd13} = $1;
        } elsif(/^spf\s+(.*)$/) {
            $token{spf} = $1;
        } elsif(/^abs\s+(.*)$/) {
            $token{abs} = $1;
        } elsif(/^gdiffpf\s+(.*)$/) {
            $token{gdiffpf} = $1;
        } elsif(/^sd\s+(.*)$/) {
            $token{sd} = $1;
        } elsif(/^CONFIDENCE SCORE\s+:\s+\'(.*)\'/) {
            $token{conf} = $1;
        }
    }
    # Flush the last token of the file, if any.
    if(%token) {
        process(\%token, \%results);
        dump_record($hUTD, \%token);
    }
    close(UTD);
    close(RES);
    undef %token;
    # Avoid dividing by zero when no token was scored; the summary then
    # reports 0/1.
    $results{total} ||= 1;
    $rr = $results{correct}/$results{total} * 100;
    $rr = int($rr*10 + 0.5)/10;
    printf("%-45s RR %4.1f %d/%d (%d oovs)\n", $base, $rr, $results{correct}, $results{total}, $results{numoovs});
}
132
133
# process(\%token, \%results)
#
# Scores one token in place and updates the per-file tallies.
#
#   $token   - hashref describing the token.  Reads ortho, choices,
#              meanings, invocab, parsed_ortho, file and the optional
#              topchoice/topscore entries.  Writes correct ("0" or "1"),
#              rankc and gender, and may overwrite invocab, choices[0],
#              scores[0] and parsed_ortho.
#   $results - hashref of counters; total, correct and numoovs are
#              incremented as appropriate.
#
# Also reads the globals $assume_invocab, $altsemfile,
# $try_semantic_validation and %csemtags.
sub process
{
    my ($token, $results) = @_;

    $token->{invocab} = 1 if($assume_invocab);

    # Optional overrides for the first choice/score.  These are read
    # through the hashref that was passed in, not through a global.
    if(defined $token->{topchoice}) {
        $token->{choices}[0] = $token->{topchoice};
    }
    if(defined $token->{topscore}) {
        $token->{scores}[0] = $token->{topscore};
    }

    # Compare case-insensitively, with underscores treated as spaces and
    # runs of whitespace collapsed.
    my $ortho = lc($token->{ortho});
    my $topch = lc($token->{choices}[0]);
    foreach my $text ($ortho, $topch) {
        $text =~ s/_/ /g;
        $text =~ s/\s\s+/ /g;
    }

    if($token->{invocab} == 0) {
        # Out-of-vocabulary tokens are counted separately and never
        # contribute to total/correct.
        $token->{correct} = "0";
        $results->{numoovs}++;
    } elsif($topch eq $ortho) {
        $results->{total}++;
        $results->{correct}++;
        $token->{correct} = "1";
    } else {
        $results->{total}++;
        # print "$token->{file} MEANINGCMP: ==$token->{meanings}[0]== ==$token->{parsed_ortho}==\n";
        # With an alternate semantics file loaded, its tag for this file
        # replaces whatever the result file supplied.
        if($altsemfile) {
            if($token->{parsed_ortho} ne $csemtags{$token->{file}}) {
                # print "changing $token->{parsed_ortho} ne $csemtags{$token->{file}}\n";
                $token->{parsed_ortho} = $csemtags{$token->{file}};
            }
        }

        if(not $try_semantic_validation) {
            $token->{correct} = "0";
        } elsif(length($token->{parsed_ortho}) > 0
                && $token->{meanings}[0] eq $token->{parsed_ortho}) {
            # Literal mismatch, but the top meaning equals the reference.
            $token->{correct} = "1";
            $results->{correct}++;
        } else {
            $token->{correct} = "0";
        }
    }

    # rankc: 1-based position of the first choice equal to the reference,
    # or 0 when none matches.  The choice gets the same normalization as
    # the top choice above so that rankc and correct cannot disagree.
    $token->{rankc} = 0;
    my @choices = @{ $token->{choices} || [] };
    for my $idx (0 .. $#choices) {
        my $choice = lc $choices[$idx];
        $choice =~ s/_/ /g;
        $choice =~ s/\s\s+/ /g;
        if($choice eq $ortho) {
            $token->{rankc} = $idx + 1;
            last;
        }
    }

    # The gender column is always written as "?", replacing any value
    # set by the caller.
    $token->{gender} = "?";
}
192
# dump_record($handle, \%token)
#
# Writes one record: the value of every column in @fields, each followed
# by ":", then a newline.
#
#   $handle - filehandle (or glob reference) to write to; falls back to
#             the UTD handle when omitted.
#   $token  - hashref holding the values.
#
# A column named <name><N> (for example "choice1" or "score1") is taken
# from element N-1 of the array stored under "<name>s".  "sd13" is the
# exception: it ends in digits but is an ordinary scalar column.
# Missing values are written as empty strings.
sub dump_record
{
    my $HH    = shift(@_) || \*UTD;
    my $token = shift(@_);
    foreach my $field (@fields) {
        my $value;
        if($field ne "sd13" && $field =~ /^(\S+)(\d+)$/) {
            $value = $token->{"${1}s"}[$2 - 1];
        } else {
            $value = $token->{$field};
        }
        $value = "" unless defined $value;
        print {$HH} $value, ":";
    }
    print {$HH} "\n";
}
210
# dump_header($handle)
#
# Writes the header line: every column name in @fields followed by ":",
# then a newline.
#
#   $handle - filehandle (or glob reference) to write to; falls back to
#             the UTD handle when omitted.
sub dump_header
{
    my $HH = shift(@_) || \*UTD;
    foreach my $field (@fields) {
        print {$HH} $field, ":";
    }
    print {$HH} "\n";
}
219
# normalize($text)
#
# Returns a cleaned-up copy of a transcription string.  In order:
#   1. every run of two or more whitespace characters becomes one space,
#   2. every ":" becomes ";",
#   3. every non-empty [bracketed] span is removed,
#   4. leading and trailing whitespace is stripped.
# Step 3 runs after step 1, so removing a bracketed span can leave two
# adjacent spaces in the result.
sub normalize
{
    my ($text) = @_;
    for ($text) {
        s/\s{2,}/ /g;
        tr/:/;/;
        s/\[[^\]]+\]//g;
        s/^\s+//;
        s/\s+$//;
    }
    return $text;
}
230
# load_altsemfile($semfile)
#
# Reads an alternate semantics file and fills two global hashes, both
# keyed by the name taken from the most recent "D: <name>" line:
#   %csemtags - text of the "CSem:" line, trailing whitespace removed
#   %semtags  - 1 for a "Sem<x>:  invocab=<digit>" line,
#               0 for a plain "Sem: <value>" line
# Tag lines seen before the first "D:" line are ignored.
# Dies if the file cannot be opened.
sub load_altsemfile
{
    my ($semfile) = @_;
    # Three-argument open with a lexical handle: the name comes from the
    # command line and must not be parsed for mode characters.
    open(my $sm, "<", $semfile) || die "error: opening $semfile\n";
    # $current and $line are lexical so that the caller's $file and $_
    # are left untouched.
    my $current;
    while(my $line = <$sm>) {
        if($line =~ /D: (\S+)$/) {
            $current = $1;
        } elsif(!defined $current) {
            next;
        } elsif($line =~ /^CSem:\s+([a-z]+.*)\s*$/i) {
            ($csemtags{$current} = $1) =~ s/\s+$//;
        } elsif($line =~ /^Sem[^:]+:  invocab=(\d)/) { # same as CREC
            # NOTE(review): the captured digit is ignored -- confirm.
            $semtags{$current} = 1;
        } elsif($line =~ /^Sem:(\s+)(\S+)/) { # same as CREC
            $semtags{$current} = 0;
        }
    }
    close($sm);
}
250
251