tools/cmd/srecres2utd.pl

#!/usr/localbin/perl

use Getopt::Long;
use File::Basename;
use lib dirname($0);

# ----------------------------------------------------------------------
# Command-line handling and construction of the list of result files.
#
# Options:
#   --add=FIELD    append FIELD to the columns written to every .utd file
#                  (may be given several times)
#   --invocab      treat every token as in-vocabulary (see process())
#   --quiet        stored in $quiet
#   --semantic     enable the semantic comparison in process(); also adds
#                  the "parsed_ortho" column
#   --altsem=FILE  load alternative semantic tags through load_altsemfile()
#
# Remaining arguments are result files.  If the first one starts with
# '@', the rest of that argument names a text file listing one result
# file per line.
# ----------------------------------------------------------------------
$assume_invocab = 0; # used when semantics are missing from the result file
$rc = GetOptions("add=s" => \@additional_fields,
                 "invocab" => \$assume_invocab,
                 "quiet" => \$quiet,
                 "semantic" => \$try_semantic_validation,
                 "altsem=s" => \$altsemfile,
                 );

# Column order of the .utd output.  Names ending in digits (choice1,
# score1, ...) are resolved against per-token arrays by dump_record().
my @fields = ("file", "correct", "invocab", "gdiff", "sd", "sd13", "spf", "abs", "gdiffpf", "rejrslt", "rankc", "match", "ortho", "choice1", "choice2", "score1", "conf", "gender");

if ($try_semantic_validation) {
    push(@additional_fields, "parsed_ortho");
}
push(@fields, @additional_fields);

# Lookup table so the main loop can cheaply test whether an optional
# column (e.g. "snr") was requested.
$additional_fieldh{$_}++ for @additional_fields;

load_altsemfile($altsemfile) if ($altsemfile);

$| = 1;    # unbuffered STDOUT

if (@ARGV && $ARGV[0] =~ /^@/) {
    # "@listfile": read the result-file names from listfile.  The file is
    # opened directly (no shell involved), so the path is taken literally
    # and an unreadable list is reported instead of being treated as empty.
    $flist = substr($ARGV[0], 1);
    open(my $flist_fh, '<', $flist) or die "error opening $flist: $!\n";
    @resfiles = ();
    while (my $entry = <$flist_fh>) {
        $entry =~ s/\s+$//;          # strip newline and trailing blanks
        next if $entry eq '';        # tolerate blank lines in the list
        push(@resfiles, $entry);
    }
    close($flist_fh);
} else {
    @resfiles = @ARGV;
}

# ----------------------------------------------------------------------
# Main conversion loop.  For every result file, write "<base>.utd" (the
# alphabetic extension of the input name replaced by ".utd") containing a
# header row followed by one record per "D:" section, then print a
# one-line recognition-rate summary for that file.
# ----------------------------------------------------------------------
foreach $resfile (@resfiles) {
    ($base = $resfile) =~ s/\.[a-z]+$//i;
    $utdfile = "$base.utd";

    # print "processing $resfile to $utdfile\n" unless($quiet);
    # Three-argument open: characters in the file name can never be taken
    # as part of the open mode.  UTD stays a bareword handle because
    # dump_record()/dump_header() write to it.
    open(RES, '<', $resfile) || die "error opening $resfile\n";
    open(UTD, '>', $utdfile) || die "error opening $utdfile\n";
    $hUTD = \*UTD;
    undef %results;
    while (<RES>) {
        s/\s+$//;
        s/^\s+//;
        if (/^D:\s+(\S+)\s*$/) { # same as CREC
            # A "D:" line starts a new token: flush the previous one, or
            # emit the header if this is the first token of the file.
            # (Plain truth test; "defined %hash" is rejected by perl >= 5.22.)
            $file = $1;
            if (%token) {
                process(\%token, \%results);
                dump_record($hUTD, \%token);
            } else {
                dump_header($hUTD);
            }
            undef %token;
            $token{file} = $file;
            # Only consult %gender when the speaker id was actually found;
            # a failed match would otherwise leave a stale $1 behind.
            my ($speaker) = $file =~ /ENU-(\d\d\d)-/;
            $token{gender} = defined($speaker) ? $gender{$speaker} : undef;
            $token{"snr"} = get_snr($file) if ($additional_fieldh{"snr"});
            $token{"snrr"} = sprintf("%.2d", int(get_snr($file)/5+0.5)*5) if ($additional_fieldh{"snrr"});
        } elsif (/^C:\s+(.*)$/) { # same as CREC
            $token{ortho} = normalize($1);
        } elsif (/^\s*(\S+) = (.*)$/) {
            # "key = value" lines; only "feedback" is used.
            my ($augkey, $augval) = ($1, $2);
            if ($augkey eq "feedback") {
                $token{parsed_ortho} = $augval;
                $token{invocab}++;
            }
        } elsif (/^R:\s+(.*)$/) { # same as CREC
            if (/<rejected/i || /<FAILED/i) {
                $token{rejrslt} = "f";
            } else {
                # $token{topchoice} = $1;
                $token{rejrslt} = "a";
            }
        } elsif (/^Sem[^:]+:  invocab=(\d)/) { # same as CREC
            # NOTE(review): the captured digit is ignored and invocab is
            # always set to 1 -- confirm this is intended.
            $token{invocab} = 1;
        } elsif (/^CSem:\s+([a-z]+.*)\s*$/i) {
            $token{parsed_ortho} = $1;
        } elsif (/^Sem:(\s+)(\S+)/) { # same as CREC
            $token{invocab} = 0;
        } elsif (/^LITERAL\[\s*0\]\s*:\s*\'(.*)\'/) {
            $token{choices}[0] = $1;
        } elsif (/^LITERAL\[\s*(\d+)\]\s+:\s+\'(.*)\'/) {
            # Use the text captured by this very match.  Re-matching the
            # line with a stricter pattern could fail and leave the index
            # digit in $1, which would then be stored as the choice.
            $token{choices}[$1] = $2;
        } elsif (/^MEANING\[\s*(\d+)\]\s+:\s+\'(.*)\'/) {
            my ($index, $meaning) = ($1, $2);
            $meaning =~ s/\s+$//;
            $token{meanings}[$index] = $meaning;
        } elsif (/^LITERAL\[(\d+)\]\[(\d+)\]\s+:\s+\'(.*)\'/) {
            # NOTE(review): the second bracketed number is stored as the
            # score and the quoted text is unused -- confirm.
            $token{scores}[$1] = $2;
        } elsif (/^RAW SCORE\s+:\s+\'(.*)\'/) {
            $token{topscore} = $1;
        } elsif (/^gdiff\s+(.*)$/) {
            $token{gdiff} = $1;
        } elsif (/^sd13\s+(.*)$/) {
            $token{sd13} = $1;
        } elsif (/^spf\s+(.*)$/) {
            $token{spf} = $1;
        } elsif (/^abs\s+(.*)$/) {
            $token{abs} = $1;
        } elsif (/^gdiffpf\s+(.*)$/) {
            $token{gdiffpf} = $1;
        } elsif (/^sd\s+(.*)$/) {
            $token{sd} = $1;
        } elsif (/^CONFIDENCE SCORE\s+:\s+\'(.*)\'/) {
            $token{conf} = $1;
        }
    }
    # Flush the last token of the file, if there was one.
    if (%token) {
        process(\%token, \%results);
        dump_record($hUTD, \%token);
    }
    close(UTD);
    close(RES);
    undef %token;

    # Per-file summary.  A zero total is bumped to 1 to avoid dividing by
    # zero; the rate is rounded to one decimal place.
    $results{total} ||= 1;
    $rr = $results{correct}/$results{total} * 100;
    $rr = int($rr*10 + 0.5)/10;
    printf("%-45s RR %4.1f %d/%d (%d oovs)\n", $base, $rr, $results{correct}, $results{total}, $results{numoovs});
}


# process($token, $results)
#
# Score one token and update the per-file counters.
#
#   $token    hash ref describing the token.  Reads: ortho, choices,
#             meanings, parsed_ortho, invocab, file, topchoice, topscore.
#             Writes: correct ("0"/"1"), rankc, gender, and possibly
#             invocab, choices[0], scores[0], parsed_ortho.
#   $results  hash ref of counters; total, correct and numoovs are
#             incremented here.
#
# Globals read: $assume_invocab, $altsemfile, %csemtags,
# $try_semantic_validation.
#
# A token whose invocab value is numerically 0 (including unset) is
# counted as out-of-vocabulary and does not contribute to total.
sub process
{
    my $token = shift(@_);
    my $results = shift(@_);
    $token->{invocab} = 1 if ($assume_invocab);

    # Read the overrides from the token that was passed in, not from the
    # global %token, so the sub works for any hash reference.
    if (defined $token->{topchoice}) {
        $token->{choices}[0] = $token->{topchoice};
    }
    if (defined $token->{topscore}) {
        $token->{scores}[0] = $token->{topscore};
    }

    my $ortho = _match_key($token->{ortho});
    my $topch = _match_key($token->{choices}[0]);

    if ($token->{invocab} == 0) {
        $token->{correct} = "0";
        $results->{numoovs}++;
    } elsif ($topch eq $ortho) {
        $results->{total}++;
        $results->{correct}++;
        $token->{correct} = "1";
    } else {
        $results->{total}++;
        # print "$token->{file} MEANINGCMP: ==$token->{meanings}[0]== ==$token->{parsed_ortho}==\n";
        if ($altsemfile) {
            # The alternative semantic file, when given, overrides the tag
            # parsed from the result file.
            if ($token->{parsed_ortho} ne $csemtags{$token->{file}}) {
                # print "changing $token->{parsed_ortho} ne $csemtags{$token->{file}}\n";
                $token->{parsed_ortho} = $csemtags{$token->{file}};
            }
        }

        if (not $try_semantic_validation) {
            $token->{correct} = "0";
        } else {
            # Literal mismatch, but the top meaning may still equal the
            # (non-empty) reference semantics.
            if ($token->{meanings}[0] eq $token->{parsed_ortho} && length($token->{parsed_ortho}) > 0) {
                $token->{correct} = "1";
                $results->{correct}++;
            } else {
                $token->{correct} = "0";
            }
        }
    }

    # rankc: 1-based position of the first choice equal to the reference,
    # 0 when none matches.  Choices go through the same normalization as
    # the top choice above so that rankc agrees with "correct".
    $token->{rankc} = 0;
    my $choices = $token->{choices} || [];
    for my $i (0 .. $#{$choices}) {
        if (_match_key($choices->[$i]) eq $ortho) {
            $token->{rankc} = $i + 1;
            last;
        }
    }
    $token->{gender} = "?";
}

# _match_key($text): comparison form of a transcription -- lower case,
# underscores turned into blanks, whitespace runs collapsed to one blank.
sub _match_key
{
    my $text = lc(shift(@_));
    $text =~ s/_/ /g;
    $text =~ s/\s\s+/ /g;
    return $text;
}

# dump_record($HH, $token)
#
# Write one record line for $token (hash ref) to the file handle $HH:
# the value of every column in @fields, each followed by ':', then a
# newline.  Missing values are written as empty columns.
#
# A column name made of a stem plus a trailing number N (e.g. "choice2")
# is read from element N-1 of the array stored under "<stem>s"
# ("choices").  "sd13" is the exception: it is an ordinary scalar column.
#
# If $HH is not supplied the global UTD handle is used.
sub dump_record
{
    my $HH = shift(@_);
    my $token = shift(@_);
    $HH = \*UTD unless defined $HH;
    foreach my $field (@fields) {
        my $value;
        # The stem must end in a non-digit so that a multi-digit suffix
        # ("choice10") is taken whole instead of being split as
        # "choice1" + "0".
        if ($field ne "sd13" && $field =~ /^(\S*[^\s\d])(\d+)$/) {
            $value = $token->{"${1}s"}[$2 - 1];
        } else {
            $value = $token->{$field};
        }
        $value = "" unless defined $value;
        print {$HH} "$value:";
    }
    print {$HH} "\n";
}

# dump_header($HH)
#
# Write the header line to the file handle $HH: every column name in
# @fields followed by ':', then a newline.  If $HH is not supplied the
# global UTD handle is used.
sub dump_header
{
    my $HH = shift(@_);
    $HH = \*UTD unless defined $HH;
    foreach my $field (@fields) {
        print {$HH} "$field:";
    }
    print {$HH} "\n";
}

# normalize($text)
#
# Return a cleaned-up copy of a transcription string.  In this order:
# whitespace runs are squeezed to a single blank, every ':' becomes ';',
# non-empty [bracketed] tags are deleted, and leading/trailing whitespace
# is trimmed.  The argument itself is not modified.
sub normalize
{
    my ($text) = @_;
    for ($text) {            # alias $_ to the copy for the edits below
        s/\s{2,}/ /g;
        s/\:/\;/g;
        s/\[[^\]]+\]//g;
        s/^\s+//;
        s/\s+$//;
    }
    return $text;
}

# load_altsemfile($semfile)
#
# Read the alternative semantic file $semfile and fill two global hashes,
# both keyed by the name taken from the most recent "D:" line:
#   %csemtags  text of the "CSem:" line, trailing whitespace removed
#   %semtags   1 for a "Sem...:  invocab=N" line, 0 for a plain "Sem:" line
# Tag lines seen before any "D:" line are ignored.
# Dies if the file cannot be opened.
sub load_altsemfile
{
    my $semfile = shift(@_);
    my $file;    # name from the current "D:" line; kept local so the
                 # caller's global $file is not disturbed
    open(my $sm, '<', $semfile) || die "error: opening $semfile\n";
    while (my $line = <$sm>) {
        if ($line =~ /D: (\S+)$/) {
            $file = $1;
            $file =~ s/\s+$//;
        } elsif (!defined $file) {
            next;    # nothing to attach this line to yet
        } elsif ($line =~ /^CSem:\s+([a-z]+.*)\s*$/i) {
            $csemtags{$file} = $1;
            $csemtags{$file} =~ s/\s+$//;
        } elsif ($line =~ /^Sem[^:]+:  invocab=(\d)/) { # same as CREC
            $semtags{$file} = 1;
        } elsif ($line =~ /^Sem:(\s+)(\S+)/) { # same as CREC
            $semtags{$file} = 0;
        }
    }
    close($sm);
}