1#!/usr/bin/perl -w
2
3# Copyright (C) 2006, 2007, 2009, 2010 Apple Inc. All rights reserved.
4#
5# Redistribution and use in source and binary forms, with or without
6# modification, are permitted provided that the following conditions
7# are met:
8#
9# 1.  Redistributions of source code must retain the above copyright
10#     notice, this list of conditions and the following disclaimer. 
11# 2.  Redistributions in binary form must reproduce the above copyright
12#     notice, this list of conditions and the following disclaimer in the
13#     documentation and/or other materials provided with the distribution. 
14# 3.  Neither the name of Apple Computer, Inc. ("Apple") nor the names of
15#     its contributors may be used to endorse or promote products derived
16#     from this software without specific prior written permission. 
17#
18# THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
19# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21# DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
22# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
25# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
27# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
29# This script is like the genstrings tool (minus most of the options) with these differences.
30#
#    1) It uses the names UI_STRING and UI_STRING_KEY for the macros, rather than the macros
32#       from NSBundle.h, and doesn't support tables (although they would be easy to add).
33#    2) It supports UTF-8 in key strings (and hence uses "" strings rather than @"" strings;
34#       @"" strings only reliably support ASCII since they are decoded based on the system encoding
35#       at runtime, so give different results on US and Japanese systems for example).
36#    3) It looks for strings that are not marked for localization, using both macro names that are
37#       known to be used for debugging in Intrigue source code and an exceptions file.
38#    4) It finds the files to work on rather than taking them as parameters, and also uses a
39#       hardcoded location for both the output file and the exceptions file.
40#       It would have been nice to use the project to find the source files, but it's too hard to
41#       locate source files after parsing a .pbxproj file.
42
43# The exceptions file has a list of strings in quotes, filenames, and filename/string pairs separated by :.
44
45use strict;
46
sub UnescapeHexSequence($);

# Tokens that introduce debugging or logging constructs. String literals that appear inside the
# parentheses/brackets following one of these tokens are never reported as unlocalized.
my %isDebugMacro = ( ASSERT_WITH_MESSAGE => 1, LOG_ERROR => 1, ERROR => 1, NSURL_ERROR => 1, FATAL => 1, LOG => 1, LOG_WARNING => 1, UI_STRING_LOCALIZE_LATER => 1, LPCTSTR_UI_STRING_LOCALIZE_LATER => 1, UNLOCALIZED_STRING => 1, UNLOCALIZED_LPCTSTR => 1, dprintf => 1, NSException => 1, NSLog => 1, printf => 1 );

@ARGV >= 2 or die "Usage: extract-localizable-strings <exceptions file> <file to update> [ directory... ]\nDid you mean to run update-webkit-localizable-strings instead?\n";

# First argument: the exceptions file, or "-" to run without one.
my $exceptionsFile = shift @ARGV;
-f $exceptionsFile or die "Couldn't find exceptions file $exceptionsFile\n" unless $exceptionsFile eq "-";

# Second argument: the strings file that is rewritten at the end of a successful run.
my $fileToUpdate = shift @ARGV;
-f $fileToUpdate or die "Couldn't find file to update $fileToUpdate\n";

# Without an exceptions file, unlocalized strings are still counted but not listed one by one.
my $warnAboutUnlocalizedStrings = $exceptionsFile ne "-";

# Remaining arguments are directories to scan; an argument of the form "-PATH" names a path to
# skip instead. With no directory arguments the current directory is scanned.
my @directories = ();
my @directoriesToSkip = ();
if (@ARGV < 1) {
    push(@directories, ".");
} else {
    for my $dir (@ARGV) {
        if ($dir =~ /^-(.*)$/) {
            push @directoriesToSkip, $1;
        } else {
            push @directories, $dir;
        }
    }
}

my $sawError = 0;

my $localizedCount = 0;
my $keyCollisionCount = 0;
my $notLocalizedCount = 0;
my $NSLocalizeCount = 0;

# %exception maps each exceptions-file entry (verbatim line) to the line number it came from;
# %usedException records which entries actually suppressed a warning.
my %exception;
my %usedException;

if ($exceptionsFile ne "-") {
    if (open my $exceptions, "<", $exceptionsFile) {
        while (my $entry = <$exceptions>) {
            chomp $entry;
            # Accepted forms: "string", file.ext, or file.ext:"string". The dot before the
            # extension is escaped so that it only matches a literal period.
            if ($entry =~ /^"([^\\"]|\\.)*"$/ or $entry =~ /^[-_\/\w.]+\.(h|m|mm|c|cpp)$/ or $entry =~ /^[-_\/\w.]+\.(h|m|mm|c|cpp):"([^\\"]|\\.)*"$/) {
                if ($exception{$entry}) {
                    print "$exceptionsFile:$.:exception for $entry appears twice\n";
                    print "$exceptionsFile:$exception{$entry}:first appearance\n";
                } else {
                    $exception{$entry} = $.;
                }
            } else {
                print "$exceptionsFile:$.:syntax error\n";
            }
        }
        close $exceptions;
    } else {
        # The file exists (checked above) but could not be read; carry on without exceptions,
        # but say so rather than silently reporting every string.
        warn "Couldn't open exceptions file $exceptionsFile: $!\n";
    }
}
101
# Quotes one argument for the shell: wrap it in single quotes and escape any embedded single
# quote, so characters such as $, ` and " in a path are passed to find verbatim.
my $shellQuote = sub {
    my ($argument) = @_;
    $argument =~ s/'/'\\''/g;
    return "'$argument'";
};

# Start directories first, then one "-path PATTERN -prune -o" clause per path to skip.
my $quotedDirectoriesString = join(" ", map { $shellQuote->($_) } @directories);
for my $dir (@directoriesToSkip) {
    $quotedDirectoriesString .= " -path " . $shellQuote->($dir) . " -prune -o";
}

# The explicit -print applies only to the name tests, so pruned paths themselves are not listed.
my @files = ( split "\n", `find $quotedDirectoriesString \\( -name "*.h" -o -name "*.m" -o -name "*.mm" -o -name "*.c" -o -name "*.cpp" \\) -print` );
108
# Scan every source file: tokenize it line by line, collect the arguments of UI_STRING-style
# macros, and report string literals that are neither localized nor excused.
for my $file (sort @files) {
    next if $file =~ /\/\w+LocalizableStrings\w*\.h$/ || $file =~ /\/LocalizedStrings\.h$/;

    # Strip the leading "./" that find prints when scanning ".". The dot is escaped so that a
    # one-character directory name such as "a/" is left intact.
    $file =~ s-^\./--;

    open my $source, "<", $file or die "can't open $file\n";

    # True while inside a /* ... */ comment that started on an earlier line.
    my $inComment = 0;

    # State of the UI_STRING invocation being parsed. $expected names the token required next
    # and is "" when no invocation is in progress.
    my $expected = "";
    my $macroLine;
    my $macro;
    my $UIString;
    my $key;
    my $comment;

    # The string literal being accumulated (adjacent literals are concatenated) and the line
    # where it started. $nestingLevel is defined only while inside a debug macro or a reported
    # NSLocalized use, and tracks bracket depth there.
    my $string;
    my $stringLine;
    my $nestingLevel;

    my $previousToken = "";

    # Consumes the completed literal in $string: inside a UI_STRING invocation it becomes the
    # next argument (string, then key for *_KEY macros, then comment); elsewhere it is checked
    # against the reasons a literal may legitimately be unlocalized.
    my $handleString = sub {
        if ($expected) {
            if (!defined $UIString) {
                # FIXME: Validate UTF-8 here?
                $UIString = $string;
                $expected = ",";
            } elsif (($macro =~ /(WEB_)?UI_STRING_KEY(_INTERNAL)?$/) and !defined $key) {
                # FIXME: Validate UTF-8 here?
                $key = $string;
                $expected = ",";
            } elsif (!defined $comment) {
                # FIXME: Validate UTF-8 here?
                $comment = $string;
                $expected = ")";
            }
        } else {
            if (defined $nestingLevel) {
                # In a debug macro, no need to localize.
            } elsif ($previousToken eq "#include" or $previousToken eq "#import") {
                # File name, no need to localize.
            } elsif ($previousToken eq "extern" and $string eq "C") {
                # extern "C", no need to localize.
            } elsif ($string eq "") {
                # Empty string can sometimes be localized, but we need not complain if not.
            } elsif ($exception{$file}) {
                $usedException{$file} = 1;
            } elsif ($exception{"\"$string\""}) {
                $usedException{"\"$string\""} = 1;
            } elsif ($exception{"$file:\"$string\""}) {
                $usedException{"$file:\"$string\""} = 1;
            } else {
                print "$file:$stringLine:\"$string\" is not marked for localization\n" if $warnAboutUnlocalizedStrings;
                $notLocalizedCount++;
            }
        }
        $string = undef;
    };

    while (<$source>) {
        chomp;

        # Handle continued multi-line comment.
        if ($inComment) {
            next unless s-.*\*/--;
            $inComment = 0;
        }

        # Handle all the tokens in the line. A token is a word (possibly starting with #), a
        # comment opener, a run of other punctuation, or a single character.
        while (s-^\s*([#\w]+|/\*|//|[^#\w/'"()\[\],]+|.)--) {
            my $token = $1;

            if ($token eq "\"") {
                if ($expected and $expected ne "a quoted string") {
                    print "$file:$.:ERROR:found a quoted string but expected $expected\n";
                    $sawError = 1;
                    $expected = "";
                }
                if (s-^(([^\\$token]|\\.)*?)$token--) {
                    if (!defined $string) {
                        $stringLine = $.;
                        $string = $1;
                    } else {
                        $string .= $1;
                    }
                } else {
                    print "$file:$.:ERROR:mismatched quotes\n";
                    $sawError = 1;
                    $_ = "";
                }
                next;
            }

            # Any non-string token ends the literal accumulated so far.
            $handleString->() if defined $string;

            $previousToken = $token;

            if ($token =~ /^NSLocalized/ && $token !~ /NSLocalizedDescriptionKey/ && $token !~ /NSLocalizedStringFromTableInBundle/) {
                print "$file:$.:ERROR:found a use of an NSLocalized macro; not supported\n";
                $nestingLevel = 0 if !defined $nestingLevel;
                $sawError = 1;
                $NSLocalizeCount++;
            } elsif ($token eq "/*") {
                if (!s-^.*?\*/--) {
                    $_ = ""; # If the comment doesn't end, discard the result of the line and set flag
                    $inComment = 1;
                }
            } elsif ($token eq "//") {
                $_ = ""; # Discard the rest of the line
            } elsif ($token eq "'") {
                if (!s-([^\\]|\\.)'--) { #' <-- that single quote makes the Project Builder editor less confused
                    print "$file:$.:ERROR:mismatched single quote\n";
                    $sawError = 1;
                    $_ = "";
                }
            } else {
                if ($expected and $expected ne $token) {
                    print "$file:$.:ERROR:found $token but expected $expected\n";
                    $sawError = 1;
                    $expected = "";
                }
                if ($token =~ /(WEB_)?UI_STRING(_KEY)?(_INTERNAL)?$/) {
                    # Start of a localization macro: reset the collected arguments.
                    $expected = "(";
                    $macro = $token;
                    $UIString = undef;
                    $key = undef;
                    $comment = undef;
                    $macroLine = $.;
                } elsif ($token eq "(" or $token eq "[") {
                    ++$nestingLevel if defined $nestingLevel;
                    $expected = "a quoted string" if $expected;
                } elsif ($token eq ",") {
                    $expected = "a quoted string" if $expected;
                } elsif ($token eq ")" or $token eq "]") {
                    # Leaving the outermost bracket of a debug macro ends the exemption.
                    $nestingLevel = undef if defined $nestingLevel && !--$nestingLevel;
                    if ($expected) {
                        # The key defaults to the string itself when the macro gives none.
                        $key = $UIString if !defined $key;
                        HandleUIString($UIString, $key, $comment, $file, $macroLine);
                        $macro = "";
                        $expected = "";
                        $localizedCount++;
                    }
                } elsif ($isDebugMacro{$token}) {
                    $nestingLevel = 0 if !defined $nestingLevel;
                }
            }
        }

    }

    # A literal that was the last token in the file has not been handled yet.
    $handleString->() if defined $string;

    if ($expected) {
        print "$file:ERROR:reached end of file but expected $expected\n";
        $sawError = 1;
    }

    close $source;
}
269
# Unescapes C language hexadecimal escape sequences.
#
# Takes the body of a C string literal (without the surrounding quotes) and returns a copy in
# which every \x<hex digits> sequence is replaced by the single byte it denotes. Everything
# else, including other backslash escapes such as \" and \\, is copied through unchanged.
# Prints a diagnostic and returns undef if a sequence denotes a value greater than 0xFF.
sub UnescapeHexSequence($)
{
    my ($originalStr) = @_;

    my $escapedStr = $originalStr;
    my $unescapedStr = "";

    for (;;) {
        if ($escapedStr =~ s-^\\x([[:xdigit:]]+)--) {
            my $value = hex($1);
            if (256 <= $value) {
                print "Hexadecimal escape sequence out of range: \\x$1\n";
                return undef;
            }
            # Convert by numeric value so that any number of digits (\xA, \x041) yields
            # exactly one byte; pack "H*" would pad or split such digit runs.
            $unescapedStr .= pack("C", $value);
        } elsif ($escapedStr =~ s-^(\\.|.)--s) {
            # Consume a backslash together with the character it escapes, so the second
            # backslash of "\\" is never mistaken for the start of a \x sequence.
            $unescapedStr .= $1;
        } else {
            return $unescapedStr;
        }
    }
}
292
# Tables describing every localizable string accepted so far, all indexed by key: the string
# to translate, its comment, and the file and line of the occurrence that was recorded.
my %stringByKey;
my %commentByKey;
my %fileByKey;
my %lineByKey;

# Records one localizable string found in the source.
#
# Arguments: the string, its key, the translator comment, and the file name and line number of
# the macro invocation (used only in messages and for "previous occurrence" reports).
#
# Hexadecimal escapes in the string, key and comment are unescaped first. If any of them has an
# illegal escape or contains U+FFFD, an error is printed, $sawError is set and nothing is
# recorded. If the key was already recorded with a different string or a different comment, a
# collision is reported and counted and the earlier entry is kept.
sub HandleUIString
{
    my ($string, $key, $comment, $file, $line) = @_;

    my $bad = 0;
    $string = UnescapeHexSequence($string);
    if (!defined($string)) {
        print "$file:$line:ERROR:string has an illegal hexadecimal escape sequence\n";
        $bad = 1;
    }
    $key = UnescapeHexSequence($key);
    if (!defined($key)) {
        print "$file:$line:ERROR:key has an illegal hexadecimal escape sequence\n";
        $bad = 1;
    }
    $comment = UnescapeHexSequence($comment);
    if (!defined($comment)) {
        print "$file:$line:ERROR:comment has an illegal hexadecimal escape sequence\n";
        $bad = 1;
    }
    if (grep { $_ == 0xFFFD } unpack "U*", $string) {
        print "$file:$line:ERROR:string for translation has illegal UTF-8 -- most likely a problem with the Text Encoding of the source file\n";
        $bad = 1;
    }
    # When the key is the string itself the problem has already been reported above.
    if ($string ne $key && grep { $_ == 0xFFFD } unpack "U*", $key) {
        print "$file:$line:ERROR:key has illegal UTF-8 -- most likely a problem with the Text Encoding of the source file\n";
        $bad = 1;
    }
    if (grep { $_ == 0xFFFD } unpack "U*", $comment) {
        print "$file:$line:ERROR:comment for translation has illegal UTF-8 -- most likely a problem with the Text Encoding of the source file\n";
        $bad = 1;
    }
    if ($bad) {
        $sawError = 1;
        return;
    }

    # Test for presence with exists rather than truthiness, so that an earlier entry whose
    # string or comment is "" or "0" still takes part in collision detection.
    if (exists $stringByKey{$key} && $stringByKey{$key} ne $string) {
        print "$file:$line:encountered the same key, \"$key\", twice, with different strings\n";
        print "$fileByKey{$key}:$lineByKey{$key}:previous occurrence\n";
        $keyCollisionCount++;
        return;
    }
    if (exists $commentByKey{$key} && $commentByKey{$key} ne $comment) {
        print "$file:$line:encountered the same key, \"$key\", twice, with different comments\n";
        print "$fileByKey{$key}:$lineByKey{$key}:previous occurrence\n";
        $keyCollisionCount++;
        return;
    }

    $fileByKey{$key} = $file;
    $lineByKey{$key} = $line;
    $stringByKey{$key} = $string;
    $commentByKey{$key} = $comment;
}
353
# Separate the per-file diagnostics printed so far from the summary below.
print "\n" if $sawError || $notLocalizedCount || $NSLocalizeCount;

# Exceptions-file entries that never suppressed a warning are probably stale; list them.
my @unusedExceptions = sort grep { !$usedException{$_} } keys %exception;
if (@unusedExceptions) {
    for my $unused (@unusedExceptions) {
        print "$exceptionsFile:$exception{$unused}:exception $unused not used\n";
    }
    print "\n";
}

print "$localizedCount localizable strings\n" if $localizedCount;
print "$keyCollisionCount key collisions\n" if $keyCollisionCount;
print "$notLocalizedCount strings not marked for localization\n" if $notLocalizedCount;
print "$NSLocalizeCount uses of NSLocalize\n" if $NSLocalizeCount;
print scalar(@unusedExceptions), " unused exceptions\n" if @unusedExceptions;

if ($sawError) {
    print "\nErrors encountered. Exiting without writing to $fileToUpdate.\n";
    exit 1;
}

# Build the .strings text: one commented "key" = "string"; entry per key, sorted by key.
my $localizedStrings = "";

for my $key (sort keys %commentByKey) {
    $localizedStrings .= "/* $commentByKey{$key} */\n\"$key\" = \"$stringByKey{$key}\";\n\n";
}

# Write out the strings file in UTF-16 with a BOM.
utf8::decode($localizedStrings) if $^V ge v5.8;

# Turn code points into UTF-16 code units. Code points above U+FFFF do not fit in the 16 bits
# that pack "n" writes, so each one is split into a high/low surrogate pair.
my @codeUnits = (0xFEFF);
for my $codePoint (unpack "U*", $localizedStrings) {
    if ($codePoint > 0xFFFF) {
        my $offset = $codePoint - 0x10000;
        push @codeUnits, 0xD800 + ($offset >> 10), 0xDC00 + ($offset & 0x3FF);
    } else {
        push @codeUnits, $codePoint;
    }
}
my $output = pack "n*", @codeUnits;

if (-e "$fileToUpdate") {
    open my $strings, ">", $fileToUpdate or die "Couldn't open $fileToUpdate for writing: $!\n";
    # The data is already encoded bytes; keep any I/O layer from translating line endings.
    binmode $strings;
    print $strings $output;
    # Buffered write errors only surface when the handle is closed.
    close $strings or die "Couldn't write $fileToUpdate: $!\n";
} else {
    print "$fileToUpdate does not exist\n";
    exit 1;
}
393