1#############################################################################
2# Perl script genlingware.pl --- composes a lingware resource from
3#                                pico knowledge base binary files (pkb)
4#                                according to given configuration
5#
6# Copyright (C) 2009 SVOX AG. All Rights Reserved.
7#
8# type perl genlingware.pl -help to get help
9#
10#############################################################################
11eval "exec perl -S \$0 \${1+\"\$@\"}"
12    if 0;
13
14$resource_structure = <<EOSTRUCT
15
16
17the resource file structure is as follows:
18------------------------------
191. optional foreign header (4-byte aligned), externally added
20------------------------------
21A) Pico header (4-byte aligned)
22 2. SVOX PICO header (signature)
23 3. length of header (2 byte, excluding length itself)
24 4. number of fields  (1 byte)
25 5. header fields (space separated) (key/value pairs)
26 6. filler (0-3)
27------------------------------
28B) length of content
297. length of the remaining content in 4-byte, excluding length itself
30------------------------------
31C) Index
32 8. summary: number of kbs
33 9. id names of kb (strings of max 15 chars plus closing space)
34 10. directory (kb id (1 byte), offset (4 bytes), size (4 bytes))
35 11. filler (0-3)
36------------------------------
37D) knowledge bases
3812. sequence of knowledge bases (byte arrays), each 4-byte aligned
39
40all numbers are little endian
41EOSTRUCT
42    ;
43
44
45
46
47
48###################################################################
49##
50##  Imports
51##
52###################################################################
53#use File::DosGlob 'glob';
54#use File::Copy;
55#use File::Path;
56#use File::Basename;
57#use Filehandle;
58#use Time::Local;
59use Getopt::Long;
60###################################################################
61##
62##  Default values
63##
64###################################################################
65$VALUE = 1;
66$NAME = "name";
67$DEST = ".";
68###################################################################
69##
70##  Options
71##
72###################################################################
73GetOptions(
74    "value=i" => \$VALUE,  # numeric
75    "name=s" => \$NAME,    # string
76    "help"    => \$HELP
77    );
78###################################################################
79##
80##  Help
81##
82###################################################################
83$help = <<EOHELP
84    $0 -- composes a lingware resource from pico knowledge base
85          binary files (pkb) according to given configuration
86
87  Usage:
88    $0 -help
89
90    print this help
91
92    $0  <config> <resource>
93
94    reads in configuration file <config> and creates resource <resource>
95
96  Arguments:
97    <config>   :  configuration file (input)
98    <resource> :  platform-independent resource file (output)
99
100
101  (For more details see the source of this script)
102EOHELP
103    ;
104die $help if $HELP;
105
106$config_example = <<EOCONFIGEXAMPLE
107
108Example of a config:
109
110-------------------------------------------------------------------
111\# collection of de-DE textana knowledge bases
112
113\# header fields:
114
115NAME                    de-DE_ta_1.0.0.0-0-2
116VERSION                 1.0.0.0-0-2
117DATE                    2009-01-15
118TIME                    17:00:00.000
119CONTENT_TYPE            TEXTANA
120
121
122\# pico knowledge bases:
123
124TPP_MAIN                \"../pkb/de-DE/de-DE_kpr.pkb\"
125TAB_GRAPHS              \"../pkb/de-DE/de-DE_ktab_graphs.pkb\"
126...
127
128-------------------------------------------------------------------
129
130for all recognized pkb tags, see %picoknow_kb_id below
131
132EOCONFIGEXAMPLE
133;
134
135
136###################################################################
137##
138##  Initialization
139##
140###################################################################
141$svoxheader = " (C) SVOX AG ";
142
143%header_field = (
144    "NAME"    => 1,
145    "VERSION" => 2,
146    "DATE"    => 3,
147    "TIME"    => 4,
148    "CONTENT_TYPE" => 5,
149);
150
151%picoknow_kb_id = (
152    NULL         => 0,
153#base
154    TAB_GRAPHS   => 2,
155    TAB_PHONES   => 3,
156    TAB_POS      => 4,
157  # FIXED_IDS     = 7,
158#dbg
159    DBG          => 8,
160
161#textana
162    TPP_MAIN     => 1,
163    LEX_MAIN     => 9,
164    DT_POSP      => 10,
165    DT_POSD      => 11,
166    DT_G2P       => 12,
167    FST_WPHO_1   => 13,
168    FST_WPHO_2   => 14,
169    FST_WPHO_3   => 15,
170    FST_WPHO_4   => 16,
171    FST_WPHO_5   => 17,
172    DT_PHR       => 18,
173    DT_ACC       => 19,
174    FST_SPHO_1   => 20,
175    FST_SPHO_2   => 21,
176    FST_SPHO_3   => 22,
177    FST_SPHO_4   => 23,
178    FST_SPHO_5   => 24,
179    FST_XSPA_PARSE   => 25,
180    FST_SVPA_PARSE   => 26,
181    FST_XS2SVPA   => 27,
182
183    FST_SPHO_6   => 28,
184    FST_SPHO_7   => 29,
185    FST_SPHO_8   => 30,
186    FST_SPHO_9   => 31,
187    FST_SPHO_10   => 32,
188
189#siggen
190    DT_DUR       => 34,
191    DT_LFZ1      => 35,
192    DT_LFZ2      => 36,
193    DT_LFZ3      => 37,
194    DT_LFZ4      => 38,
195    DT_LFZ5      => 39,
196    DT_MGC1      => 40,
197    DT_MGC2      => 41,
198    DT_MGC3      => 42,
199    DT_MGC4      => 43,
200    DT_MGC5      => 44,
201    PDF_DUR      => 45,
202    PDF_LFZ      => 46,
203    PDF_MGC      => 47,
204    PDF_PHS      => 48,
205
206#user tpp
207    TPP_USER1    => 49,
208    TPP_USER2    => 50,
209#user lex
210    LEX_USER1    => 57,
211    LEX_USER2    => 58,
212
213    DUMMY => 127
214    );
215
216
217
218
219###################################################################
220##
221##  Get Parameters
222##
223###################################################################
224($in,$out) = (shift,shift);
225
226unless ($in && $out) {
227    print "*** error: incorrect number of parameters\n";
228    die $help;
229}
230
231
232###################################################################
233##
234##  Work
235##
236###################################################################
237
238#get description of kbs
239
240unless (open IN, $in) {
241    print "*** error: can't open $in\n";
242    die "can't open $in";
243}
244while (<IN>) {
245    # skip comments
246    next if /^\s*#/;
247    ($key, $value) = split;
248    next unless $key;
249    $value =~ s/^\s*\"(.*)\"\s*$/\1/;
250    if ($field = $header_field{$key}) {
251        $fields[$field] = {key => "$key", value => "$value"};
252        next;
253    }
254    #print "$key -> $value\n";
255    unless ($id = $picoknow_kb_id{$key}) {
256	print "*** error: not a valid knowledge base name $key\n";
257	die "not a valid knowledge base name $key" ;
258    }
259    push @kb, {name => $key, file => $value, id => $id};
260}
261close IN;
262
263
264
265#open output lingware file and write header
266
267unless (open OUT, ">$out") {
268    print "*** error: can't open $out for writing\n";
269    die "can't open $out for writing";
270}
271binmode(OUT);
272
273$offs = 0;
274
275###################################################################
276##
277##  A) PICO HEADER
278##
279###################################################################
280# 1. SVOX HEADER
281foreach $ch (split //, $svoxheader) {
282    push @svoxheader, chr(ord($ch)-ord(' '));
283}
284$offs += print_offs(@svoxheader);
285
286print "offset after svoxheader: $offs\n";
287
288# fields header
289$fieldheader = "";
290foreach $field (@fields) {
291    $fieldheader .= $field->{key} .  " " . $field->{value} . " ";
292}
293
294#print size of fields header
295
296# length of the fields plus preceding number of fields (1)
297$len = length($fieldheader)+1;
298
299#fill should make the whole header 4-byte aligned, i.e. the current offs
300# (svoxheader) plus the length of the header (2) plus the fields
301$fill = ($offs + 2 + $len) % 4;
302$fill = $fill ? 4-$fill : 0;
303$len += $fill;
304
305print "filled length of header : $len\n";
306
307# 2. length of header
308$offs += &print_uint16($len); #write little-endian 16-bit cardinal
309print "offset after length of header: $offs\n";
310
311# 3. print number of fields
312$offs += &print_uint8(@fields+0);
313print "offset after number of fields: $offs\n";
314
315# 4. print fields
316$offs += print_offs($fieldheader);
317print "offset after fields: $offs\n";
318
319# 5. print magic number (not yet)
320
321# 6. print filler
322$offs += &opt_fill($fill);
323print "offset after fill: $offs\n";
324
325
326###################################################################
327##
328##  CONTENT (that is actually saved in RAM)
329##
330###################################################################
331
332
333
334# open kb files to get sizes and calculate fillers
335
336foreach $kb (@kb) {
337    if ($kb->{file}) {
338	unless (open IN, $kb->{file}) {
339	    print "*** error: can't open " . $kb->{file} ."\n";
340	    die("can't open PKB " . $kb->{file});
341	}
342	binmode(IN);
343	#slurp in the whole file
344	@content = <IN>;
345	close IN;
346	$kb->{size} = length(join '',@content);
347	$fill = ($kb->{size} % 4);
348	$kb->{fill} = $fill ? 4-$fill : 0;
349	print "fill of ", $kb->{name} , " is ", $kb->{fill}+0, "\n";
350	$totalcont += $kb->{size} + $kb->{fill};
351    } else {
352	$kb->{size} = 0;
353    }
354    print $kb->{name}, " -> ", $kb->{file}, " -> ", $kb->{size}, "\n";
355}
356
357
358# calculate total content size (B):
359
360$totalcont = 0;
361
362# size of number of kbs (7.)
363$totalcont += 1;
364
365#size of names of kbs (8.)
366foreach $kb (@kb) {
367    $totalcont += &size_offs($kb->{name}, " ");
368}
369
370
371# size of directory (9.)
372
373$totalcont += 9 * @kb;
374
375# size of filler (10.)
376$xfill = $totalcont % 4;
377$xfill = $xfill ? 4 - $xfill : 0;
378$totalcont += $xfill;
379
380#set root of first kb
381$offs1 = $totalcont;
382
383print "totalcont before kbs: $totalcont\n";
384# size of actual kbs
385foreach $kb (@kb) {
386    $totalcont += $kb->{size} + $kb->{fill};
387}
388print "totalcont after kbs: $totalcont\n";
389
390# B) print the total size
391
392$offs += &print_uint32($totalcont);
393
394print "offset after size of content (to be loaded): $offs\n";
395
396# here starts the part that is stored in "permament memory"
397
398$offs = 0;
399
400print "offset after reset: $offs\n";
401
402
403
404# 7. print number of kbs:
405$offs += &print_uint8(@kb+0);
406print "offset after number of kbs: $offs\n";
407
408
409# 8. print names of kbs
410foreach $kb (@kb) {
411    $offs += &print_offs($kb->{name}, " ");
412}
413print "offset after descriptive kb names: $offs\n";
414
415# 9. print directory (ids and offsets of kbs)
416
417
418print "first kb should start at $offs1\n";
419foreach $kb (@kb) {
420    $offs += &print_uint8($kb->{id});
421    print "kb id: $kb->{id}, offs=$offs\n";
422    if ($kb->{size}) {
423    print "real kb (size $kb->{size})\n";
424	$offs += &print_uint32($offs1);
425    print "kb offs: $offs1, offs=$offs\n";
426	$offs += &print_uint32($kb->{size});
427    print "kb size: $kb->{size}, offs=$offs\n";
428	$offs1 += $kb->{size} + $kb->{fill};
429    } else {
430    print "dummy kb (size 0)\n";
431	$offs += &print_uint32(0);
432    print "kb offs: 0, offs=$offs\n";
433	$offs += &print_uint32(0);
434    print "kb size: $kb->{size}, offs=$offs\n";
435    }
436    print "offset after directory entry for kb $kb->{name} (id $kb->{id}): $offs\n";
437}
438
439# 10. print filler
440$offs += &opt_fill($xfill);
441print "offset after fill: $offs\n";
442
443
444# print kbs themselves
445
446foreach $kb (@kb) {
447    if ($kb->{file}) {
448	unless (open IN, $kb->{file}) {
449	    print "*** error: can't open " . $kb->{file} ."\n";
450	    die "can't open " . $kb->{file};
451	}
452	binmode(IN);
453	#slurp in the whole file
454	@content = <IN>;
455	close IN;
456	print OUT join '', @content;
457	$offs +=  $kb->{size};
458	$offs += &opt_fill($kb->{fill});
459	print "offset after kb $kb->{file}: $offs\n";
460    } else {
461    }
462    #print $kb->{name}, " -> ", $kb->{file}, " -> ", $kb->{size}, "\n";
463}
464
465
466close OUT;
467
468
469
470# optional filler
471# use for alignment if filler doesn't need to be parsed
472sub opt_fill() {
473    my ($fill) = @_;
474    my $size = 0;
475    if ($fill) {
476	$size += &print_uint8($fill);
477	for (my $i = 1; $i < $fill; $i++) {
478	    $size += &print_uint8(0);
479	}
480    }
481    return $size;
482}
483
484# mandatory filler
485# use for alignment if filler needs to be parsed (at least one byte)
486sub mand_fill() {
487    my ($fill) = @_;
488    my $size = 0;
489    #force fill to be 4 if there is no fill
490    $fill = 4 unless $fill;
491    $size += &print_uint8($fill);
492    for (my $i = 1; $i < $fill; $i++) {
493	$size += &print_uint8(0);
494    }
495    return $size;
496}
497
498sub print_offs() {
499    my (@cont) = @_;
500    my $size = 0;
501    foreach my $cont (@cont) {
502	$size += length($cont);
503	print OUT $cont;
504    }
505    return $size;
506}
507
508sub size_offs() {
509    my (@cont) = @_;
510    my $size = 0;
511    foreach my $cont (@cont) {
512	$size += length($cont);
513    }
514    return $size;
515}
516
517sub print_uint_n() {
518    #little-endian n-byte cardinal (no check, though!)
519    my ($num, $n) = @_;
520    my (@out) = ();
521    for (my $i=0; $i < $n;$i++) {
522	$out[$i] = $num % 256;
523	$num = int($num / 256);
524    }
525    print OUT pack("c*",@out);
526    return $n;
527}
528
529sub print_uint32() {
530    #little-endian 4-byte cardinal (no check, though!)
531    my ($num) = @_;
532    return &print_uint_n($num,4);
533}
534
535sub print_uint16() {
536    #little-endian 2-byte cardinal (no check, though!)
537    my ($num) = @_;
538    return &print_uint_n($num,2);
539}
540sub print_uint8() {
541    my ($num) = @_;
542    return &print_uint_n($num,1);
543}
544
545sub numerically {$x < $y}
546
547# @x = map {chr(48+$_) } (0..9);
548# foreach $x (@x) {
549#     print $x, "\n";
550# }
551# @x = map +(chr(65+$_)), (0..9);
552# foreach $x (@x) {
553#     print $x, "\n";
554# }
555