1############################################################################# 2# Perl script genlingware.pl --- composes a lingware resource from 3# pico knowledge base binary files (pkb) 4# according to given configuration 5# 6# Copyright (C) 2009 SVOX AG. All Rights Reserved. 7# 8# type perl genlingware.pl -help to get help 9# 10############################################################################# 11eval "exec perl -S \$0 \${1+\"\$@\"}" 12 if 0; 13 14$resource_structure = <<EOSTRUCT 15 16 17the resource file structure is as follows: 18------------------------------ 191. optional foreign header (4-byte aligned), externally added 20------------------------------ 21A) Pico header (4-byte aligned) 22 2. SVOX PICO header (signature) 23 3. length of header (2 byte, excluding length itself) 24 4. number of fields (1 byte) 25 5. header fields (space separated) (key/value pairs) 26 6. filler (0-3) 27------------------------------ 28B) length of content 297. length of the remaining content in 4-byte, excluding length itself 30------------------------------ 31C) Index 32 8. summary: number of kbs 33 9. id names of kb (strings of max 15 chars plus closing space) 34 10. directory (kb id (1 byte), offset (4 bytes), size (4 bytes)) 35 11. filler (0-3) 36------------------------------ 37D) knowledge bases 3812. sequence of knowledge bases (byte arrays), each 4-byte aligned 39 40all numbers are little endian 41EOSTRUCT 42 ; 43 44 45 46 47 48################################################################### 49## 50## Imports 51## 52################################################################### 53#use File::DosGlob 'glob'; 54#use File::Copy; 55#use File::Path; 56#use File::Basename; 57#use Filehandle; 58#use Time::Local; 59use Getopt::Long; 60################################################################### 61## 62## Default values 63## 64################################################################### 65$VALUE = 1; 66$NAME = "name"; 67$DEST = "."; 68################################################################### 69## 70## Options 71## 72################################################################### 73GetOptions( 74 "value=i" => \$VALUE, # numeric 75 "name=s" => \$NAME, # string 76 "help" => \$HELP 77 ); 78################################################################### 79## 80## Help 81## 82################################################################### 83$help = <<EOHELP 84 $0 -- composes a lingware resource from pico knowledge base 85 binary files (pkb) according to given configuration 86 87 Usage: 88 $0 -help 89 90 print this help 91 92 $0 <config> <resource> 93 94 reads in configuration file <config> and creates resource <resource> 95 96 Arguments: 97 <config> : configuration file (input) 98 <resource> : platform-independent resource file (output) 99 100 101 (For more details see the source of this script) 102EOHELP 103 ; 104die $help if $HELP; 105 106$config_example = <<EOCONFIGEXAMPLE 107 108Example of a config: 109 110------------------------------------------------------------------- 111\# collection of de-DE textana knowledge bases 112 113\# header fields: 114 115NAME de-DE_ta_1.0.0.0-0-2 116VERSION 1.0.0.0-0-2 117DATE 2009-01-15 118TIME 17:00:00.000 119CONTENT_TYPE TEXTANA 120 121 122\# pico knowledge bases: 123 124TPP_MAIN \"../pkb/de-DE/de-DE_kpr.pkb\" 125TAB_GRAPHS \"../pkb/de-DE/de-DE_ktab_graphs.pkb\" 126... 127 128------------------------------------------------------------------- 129 130for all recognized pkb tags, see %picoknow_kb_id below 131 132EOCONFIGEXAMPLE 133; 134 135 136################################################################### 137## 138## Initialization 139## 140################################################################### 141$svoxheader = " (C) SVOX AG "; 142 143%header_field = ( 144 "NAME" => 1, 145 "VERSION" => 2, 146 "DATE" => 3, 147 "TIME" => 4, 148 "CONTENT_TYPE" => 5, 149); 150 151%picoknow_kb_id = ( 152 NULL => 0, 153#base 154 TAB_GRAPHS => 2, 155 TAB_PHONES => 3, 156 TAB_POS => 4, 157 # FIXED_IDS = 7, 158#dbg 159 DBG => 8, 160 161#textana 162 TPP_MAIN => 1, 163 LEX_MAIN => 9, 164 DT_POSP => 10, 165 DT_POSD => 11, 166 DT_G2P => 12, 167 FST_WPHO_1 => 13, 168 FST_WPHO_2 => 14, 169 FST_WPHO_3 => 15, 170 FST_WPHO_4 => 16, 171 FST_WPHO_5 => 17, 172 DT_PHR => 18, 173 DT_ACC => 19, 174 FST_SPHO_1 => 20, 175 FST_SPHO_2 => 21, 176 FST_SPHO_3 => 22, 177 FST_SPHO_4 => 23, 178 FST_SPHO_5 => 24, 179 FST_XSPA_PARSE => 25, 180 FST_SVPA_PARSE => 26, 181 FST_XS2SVPA => 27, 182 183 FST_SPHO_6 => 28, 184 FST_SPHO_7 => 29, 185 FST_SPHO_8 => 30, 186 FST_SPHO_9 => 31, 187 FST_SPHO_10 => 32, 188 189#siggen 190 DT_DUR => 34, 191 DT_LFZ1 => 35, 192 DT_LFZ2 => 36, 193 DT_LFZ3 => 37, 194 DT_LFZ4 => 38, 195 DT_LFZ5 => 39, 196 DT_MGC1 => 40, 197 DT_MGC2 => 41, 198 DT_MGC3 => 42, 199 DT_MGC4 => 43, 200 DT_MGC5 => 44, 201 PDF_DUR => 45, 202 PDF_LFZ => 46, 203 PDF_MGC => 47, 204 PDF_PHS => 48, 205 206#user tpp 207 TPP_USER1 => 49, 208 TPP_USER2 => 50, 209#user lex 210 LEX_USER1 => 57, 211 LEX_USER2 => 58, 212 213 DUMMY => 127 214 ); 215 216 217 218 219################################################################### 220## 221## Get Parameters 222## 223################################################################### 224($in,$out) = (shift,shift); 225 226unless ($in && $out) { 227 print "*** error: incorrect number of parameters\n"; 228 die $help; 229} 230 231 232################################################################### 233## 234## Work 235## 236################################################################### 237 238#get description of kbs 239 240unless (open IN, $in) { 241 print "*** error: can't open $in\n"; 242 die "can't open $in"; 243} 244while (<IN>) { 245 # skip comments 246 next if /^\s*#/; 247 ($key, $value) = split; 248 next unless $key; 249 $value =~ s/^\s*\"(.*)\"\s*$/\1/; 250 if ($field = $header_field{$key}) { 251 $fields[$field] = {key => "$key", value => "$value"}; 252 next; 253 } 254 #print "$key -> $value\n"; 255 unless ($id = $picoknow_kb_id{$key}) { 256 print "*** error: not a valid knowledge base name $key\n"; 257 die "not a valid knowledge base name $key" ; 258 } 259 push @kb, {name => $key, file => $value, id => $id}; 260} 261close IN; 262 263 264 265#open output lingware file and write header 266 267unless (open OUT, ">$out") { 268 print "*** error: can't open $out for writing\n"; 269 die "can't open $out for writing"; 270} 271binmode(OUT); 272 273$offs = 0; 274 275################################################################### 276## 277## A) PICO HEADER 278## 279################################################################### 280# 1. SVOX HEADER 281foreach $ch (split //, $svoxheader) { 282 push @svoxheader, chr(ord($ch)-ord(' ')); 283} 284$offs += print_offs(@svoxheader); 285 286print "offset after svoxheader: $offs\n"; 287 288# fields header 289$fieldheader = ""; 290foreach $field (@fields) { 291 $fieldheader .= $field->{key} . " " . $field->{value} . " "; 292} 293 294#print size of fields header 295 296# length of the fields plus preceding number of fields (1) 297$len = length($fieldheader)+1; 298 299#fill should make the whole header 4-byte aligned, i.e. the current offs 300# (svoxheader) plus the length of the header (2) plus the fields 301$fill = ($offs + 2 + $len) % 4; 302$fill = $fill ? 4-$fill : 0; 303$len += $fill; 304 305print "filled length of header : $len\n"; 306 307# 2. length of header 308$offs += &print_uint16($len); #write little-endian 16-bit cardinal 309print "offset after length of header: $offs\n"; 310 311# 3. print number of fields 312$offs += &print_uint8(@fields+0); 313print "offset after number of fields: $offs\n"; 314 315# 4. print fields 316$offs += print_offs($fieldheader); 317print "offset after fields: $offs\n"; 318 319# 5. print magic number (not yet) 320 321# 6. print filler 322$offs += &opt_fill($fill); 323print "offset after fill: $offs\n"; 324 325 326################################################################### 327## 328## CONTENT (that is actually saved in RAM) 329## 330################################################################### 331 332 333 334# open kb files to get sizes and calculate fillers 335 336foreach $kb (@kb) { 337 if ($kb->{file}) { 338 unless (open IN, $kb->{file}) { 339 print "*** error: can't open " . $kb->{file} ."\n"; 340 die("can't open PKB " . $kb->{file}); 341 } 342 binmode(IN); 343 #slurp in the whole file 344 @content = <IN>; 345 close IN; 346 $kb->{size} = length(join '',@content); 347 $fill = ($kb->{size} % 4); 348 $kb->{fill} = $fill ? 4-$fill : 0; 349 print "fill of ", $kb->{name} , " is ", $kb->{fill}+0, "\n"; 350 $totalcont += $kb->{size} + $kb->{fill}; 351 } else { 352 $kb->{size} = 0; 353 } 354 print $kb->{name}, " -> ", $kb->{file}, " -> ", $kb->{size}, "\n"; 355} 356 357 358# calculate total content size (B): 359 360$totalcont = 0; 361 362# size of number of kbs (7.) 363$totalcont += 1; 364 365#size of names of kbs (8.) 366foreach $kb (@kb) { 367 $totalcont += &size_offs($kb->{name}, " "); 368} 369 370 371# size of directory (9.) 372 373$totalcont += 9 * @kb; 374 375# size of filler (10.) 376$xfill = $totalcont % 4; 377$xfill = $xfill ? 4 - $xfill : 0; 378$totalcont += $xfill; 379 380#set root of first kb 381$offs1 = $totalcont; 382 383print "totalcont before kbs: $totalcont\n"; 384# size of actual kbs 385foreach $kb (@kb) { 386 $totalcont += $kb->{size} + $kb->{fill}; 387} 388print "totalcont after kbs: $totalcont\n"; 389 390# B) print the total size 391 392$offs += &print_uint32($totalcont); 393 394print "offset after size of content (to be loaded): $offs\n"; 395 396# here starts the part that is stored in "permament memory" 397 398$offs = 0; 399 400print "offset after reset: $offs\n"; 401 402 403 404# 7. print number of kbs: 405$offs += &print_uint8(@kb+0); 406print "offset after number of kbs: $offs\n"; 407 408 409# 8. print names of kbs 410foreach $kb (@kb) { 411 $offs += &print_offs($kb->{name}, " "); 412} 413print "offset after descriptive kb names: $offs\n"; 414 415# 9. print directory (ids and offsets of kbs) 416 417 418print "first kb should start at $offs1\n"; 419foreach $kb (@kb) { 420 $offs += &print_uint8($kb->{id}); 421 print "kb id: $kb->{id}, offs=$offs\n"; 422 if ($kb->{size}) { 423 print "real kb (size $kb->{size})\n"; 424 $offs += &print_uint32($offs1); 425 print "kb offs: $offs1, offs=$offs\n"; 426 $offs += &print_uint32($kb->{size}); 427 print "kb size: $kb->{size}, offs=$offs\n"; 428 $offs1 += $kb->{size} + $kb->{fill}; 429 } else { 430 print "dummy kb (size 0)\n"; 431 $offs += &print_uint32(0); 432 print "kb offs: 0, offs=$offs\n"; 433 $offs += &print_uint32(0); 434 print "kb size: $kb->{size}, offs=$offs\n"; 435 } 436 print "offset after directory entry for kb $kb->{name} (id $kb->{id}): $offs\n"; 437} 438 439# 10. print filler 440$offs += &opt_fill($xfill); 441print "offset after fill: $offs\n"; 442 443 444# print kbs themselves 445 446foreach $kb (@kb) { 447 if ($kb->{file}) { 448 unless (open IN, $kb->{file}) { 449 print "*** error: can't open " . $kb->{file} ."\n"; 450 die "can't open " . $kb->{file}; 451 } 452 binmode(IN); 453 #slurp in the whole file 454 @content = <IN>; 455 close IN; 456 print OUT join '', @content; 457 $offs += $kb->{size}; 458 $offs += &opt_fill($kb->{fill}); 459 print "offset after kb $kb->{file}: $offs\n"; 460 } else { 461 } 462 #print $kb->{name}, " -> ", $kb->{file}, " -> ", $kb->{size}, "\n"; 463} 464 465 466close OUT; 467 468 469 470# optional filler 471# use for alignment if filler doesn't need to be parsed 472sub opt_fill() { 473 my ($fill) = @_; 474 my $size = 0; 475 if ($fill) { 476 $size += &print_uint8($fill); 477 for (my $i = 1; $i < $fill; $i++) { 478 $size += &print_uint8(0); 479 } 480 } 481 return $size; 482} 483 484# mandatory filler 485# use for alignment if filler needs to be parsed (at least one byte) 486sub mand_fill() { 487 my ($fill) = @_; 488 my $size = 0; 489 #force fill to be 4 if there is no fill 490 $fill = 4 unless $fill; 491 $size += &print_uint8($fill); 492 for (my $i = 1; $i < $fill; $i++) { 493 $size += &print_uint8(0); 494 } 495 return $size; 496} 497 498sub print_offs() { 499 my (@cont) = @_; 500 my $size = 0; 501 foreach my $cont (@cont) { 502 $size += length($cont); 503 print OUT $cont; 504 } 505 return $size; 506} 507 508sub size_offs() { 509 my (@cont) = @_; 510 my $size = 0; 511 foreach my $cont (@cont) { 512 $size += length($cont); 513 } 514 return $size; 515} 516 517sub print_uint_n() { 518 #little-endian n-byte cardinal (no check, though!) 519 my ($num, $n) = @_; 520 my (@out) = (); 521 for (my $i=0; $i < $n;$i++) { 522 $out[$i] = $num % 256; 523 $num = int($num / 256); 524 } 525 print OUT pack("c*",@out); 526 return $n; 527} 528 529sub print_uint32() { 530 #little-endian 4-byte cardinal (no check, though!) 531 my ($num) = @_; 532 return &print_uint_n($num,4); 533} 534 535sub print_uint16() { 536 #little-endian 2-byte cardinal (no check, though!) 537 my ($num) = @_; 538 return &print_uint_n($num,2); 539} 540sub print_uint8() { 541 my ($num) = @_; 542 return &print_uint_n($num,1); 543} 544 545sub numerically {$x < $y} 546 547# @x = map {chr(48+$_) } (0..9); 548# foreach $x (@x) { 549# print $x, "\n"; 550# } 551# @x = map +(chr(65+$_)), (0..9); 552# foreach $x (@x) { 553# print $x, "\n"; 554# } 555