1/* Determine a canonical name for the current locale's character encoding.
2
3   Copyright (C) 2000-2006, 2008-2012 Free Software Foundation, Inc.
4
5   This program is free software; you can redistribute it and/or modify
6   it under the terms of the GNU General Public License as published by
7   the Free Software Foundation; either version 3, or (at your option)
8   any later version.
9
10   This program is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   GNU General Public License for more details.
14
15   You should have received a copy of the GNU General Public License along
16   with this program; if not, see <http://www.gnu.org/licenses/>.  */
17
18/* Written by Bruno Haible <bruno@clisp.org>.  */
19
20#include <config.h>
21
22/* Specification.  */
23#include "localcharset.h"
24
25#include <fcntl.h>
26#include <stddef.h>
27#include <stdio.h>
28#include <string.h>
29#include <stdlib.h>
30
31#if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET
32# define DARWIN7 /* Darwin 7 or newer, i.e. Mac OS X 10.3 or newer */
33#endif
34
35#if defined _WIN32 || defined __WIN32__
36# define WINDOWS_NATIVE
37#endif
38
39#if defined __EMX__
40/* Assume EMX program runs on OS/2, even if compiled under DOS.  */
41# ifndef OS2
42#  define OS2
43# endif
44#endif
45
46#if !defined WINDOWS_NATIVE
47# include <unistd.h>
48# if HAVE_LANGINFO_CODESET
49#  include <langinfo.h>
50# else
51#  if 0 /* see comment below */
52#   include <locale.h>
53#  endif
54# endif
55# ifdef __CYGWIN__
56#  define WIN32_LEAN_AND_MEAN
57#  include <windows.h>
58# endif
59#elif defined WINDOWS_NATIVE
60# define WIN32_LEAN_AND_MEAN
61# include <windows.h>
62#endif
63#if defined OS2
64# define INCL_DOS
65# include <os2.h>
66#endif
67
68#if ENABLE_RELOCATABLE
69# include "relocatable.h"
70#else
71# define relocate(pathname) (pathname)
72#endif
73
74/* Get LIBDIR.  */
75#ifndef LIBDIR
76# include "configmake.h"
77#endif
78
79/* Define O_NOFOLLOW to 0 on platforms where it does not exist.  */
80#ifndef O_NOFOLLOW
81# define O_NOFOLLOW 0
82#endif
83
84#if defined _WIN32 || defined __WIN32__ || defined __CYGWIN__ || defined __EMX__ || defined __DJGPP__
85  /* Native Windows, Cygwin, OS/2, DOS */
86# define ISSLASH(C) ((C) == '/' || (C) == '\\')
87#endif
88
89#ifndef DIRECTORY_SEPARATOR
90# define DIRECTORY_SEPARATOR '/'
91#endif
92
93#ifndef ISSLASH
94# define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR)
95#endif
96
97#if HAVE_DECL_GETC_UNLOCKED
98# undef getc
99# define getc getc_unlocked
100#endif
101
102/* The following static variable is declared 'volatile' to avoid a
103   possible multithread problem in the function get_charset_aliases. If we
104   are running in a threaded environment, and if two threads initialize
105   'charset_aliases' simultaneously, both will produce the same value,
106   and everything will be ok if the two assignments to 'charset_aliases'
107   are atomic. But I don't know what will happen if the two assignments mix.  */
108#if __STDC__ != 1
109# define volatile /* empty */
110#endif
111/* Pointer to the contents of the charset.alias file, if it has already been
112   read, else NULL.  Its format is:
113   ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0'  */
114static const char * volatile charset_aliases;
115
116/* Return a pointer to the contents of the charset.alias file.  */
117static const char *
118get_charset_aliases (void)
119{
120  const char *cp;
121
122  cp = charset_aliases;
123  if (cp == NULL)
124    {
125#if !(defined DARWIN7 || defined VMS || defined WINDOWS_NATIVE || defined __CYGWIN__)
126      const char *dir;
127      const char *base = "charset.alias";
128      char *file_name;
129
130      /* Make it possible to override the charset.alias location.  This is
131         necessary for running the testsuite before "make install".  */
132      dir = getenv ("CHARSETALIASDIR");
133      if (dir == NULL || dir[0] == '\0')
134        dir = relocate (LIBDIR);
135
136      /* Concatenate dir and base into freshly allocated file_name.  */
137      {
138        size_t dir_len = strlen (dir);
139        size_t base_len = strlen (base);
140        int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1]));
141        file_name = (char *) malloc (dir_len + add_slash + base_len + 1);
142        if (file_name != NULL)
143          {
144            memcpy (file_name, dir, dir_len);
145            if (add_slash)
146              file_name[dir_len] = DIRECTORY_SEPARATOR;
147            memcpy (file_name + dir_len + add_slash, base, base_len + 1);
148          }
149      }
150
151      if (file_name == NULL)
152        /* Out of memory.  Treat the file as empty.  */
153        cp = "";
154      else
155        {
156          int fd;
157
158          /* Open the file.  Reject symbolic links on platforms that support
159             O_NOFOLLOW.  This is a security feature.  Without it, an attacker
160             could retrieve parts of the contents (namely, the tail of the
161             first line that starts with "* ") of an arbitrary file by placing
162             a symbolic link to that file under the name "charset.alias" in
163             some writable directory and defining the environment variable
164             CHARSETALIASDIR to point to that directory.  */
165          fd = open (file_name,
166                     O_RDONLY | (HAVE_WORKING_O_NOFOLLOW ? O_NOFOLLOW : 0));
167          if (fd < 0)
168            /* File not found.  Treat it as empty.  */
169            cp = "";
170          else
171            {
172              FILE *fp;
173
174              fp = fdopen (fd, "r");
175              if (fp == NULL)
176                {
177                  /* Out of memory.  Treat the file as empty.  */
178                  close (fd);
179                  cp = "";
180                }
181              else
182                {
183                  /* Parse the file's contents.  */
184                  char *res_ptr = NULL;
185                  size_t res_size = 0;
186
187                  for (;;)
188                    {
189                      int c;
190                      char buf1[50+1];
191                      char buf2[50+1];
192                      size_t l1, l2;
193                      char *old_res_ptr;
194
195                      c = getc (fp);
196                      if (c == EOF)
197                        break;
198                      if (c == '\n' || c == ' ' || c == '\t')
199                        continue;
200                      if (c == '#')
201                        {
202                          /* Skip comment, to end of line.  */
203                          do
204                            c = getc (fp);
205                          while (!(c == EOF || c == '\n'));
206                          if (c == EOF)
207                            break;
208                          continue;
209                        }
210                      ungetc (c, fp);
211                      if (fscanf (fp, "%50s %50s", buf1, buf2) < 2)
212                        break;
213                      l1 = strlen (buf1);
214                      l2 = strlen (buf2);
215                      old_res_ptr = res_ptr;
216                      if (res_size == 0)
217                        {
218                          res_size = l1 + 1 + l2 + 1;
219                          res_ptr = (char *) malloc (res_size + 1);
220                        }
221                      else
222                        {
223                          res_size += l1 + 1 + l2 + 1;
224                          res_ptr = (char *) realloc (res_ptr, res_size + 1);
225                        }
226                      if (res_ptr == NULL)
227                        {
228                          /* Out of memory. */
229                          res_size = 0;
230                          free (old_res_ptr);
231                          break;
232                        }
233                      strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1);
234                      strcpy (res_ptr + res_size - (l2 + 1), buf2);
235                    }
236                  fclose (fp);
237                  if (res_size == 0)
238                    cp = "";
239                  else
240                    {
241                      *(res_ptr + res_size) = '\0';
242                      cp = res_ptr;
243                    }
244                }
245            }
246
247          free (file_name);
248        }
249
250#else
251
252# if defined DARWIN7
253      /* To avoid the trouble of installing a file that is shared by many
254         GNU packages -- many packaging systems have problems with this --,
255         simply inline the aliases here.  */
256      cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
257           "ISO8859-2" "\0" "ISO-8859-2" "\0"
258           "ISO8859-4" "\0" "ISO-8859-4" "\0"
259           "ISO8859-5" "\0" "ISO-8859-5" "\0"
260           "ISO8859-7" "\0" "ISO-8859-7" "\0"
261           "ISO8859-9" "\0" "ISO-8859-9" "\0"
262           "ISO8859-13" "\0" "ISO-8859-13" "\0"
263           "ISO8859-15" "\0" "ISO-8859-15" "\0"
264           "KOI8-R" "\0" "KOI8-R" "\0"
265           "KOI8-U" "\0" "KOI8-U" "\0"
266           "CP866" "\0" "CP866" "\0"
267           "CP949" "\0" "CP949" "\0"
268           "CP1131" "\0" "CP1131" "\0"
269           "CP1251" "\0" "CP1251" "\0"
270           "eucCN" "\0" "GB2312" "\0"
271           "GB2312" "\0" "GB2312" "\0"
272           "eucJP" "\0" "EUC-JP" "\0"
273           "eucKR" "\0" "EUC-KR" "\0"
274           "Big5" "\0" "BIG5" "\0"
275           "Big5HKSCS" "\0" "BIG5-HKSCS" "\0"
276           "GBK" "\0" "GBK" "\0"
277           "GB18030" "\0" "GB18030" "\0"
278           "SJIS" "\0" "SHIFT_JIS" "\0"
279           "ARMSCII-8" "\0" "ARMSCII-8" "\0"
280           "PT154" "\0" "PT154" "\0"
281         /*"ISCII-DEV" "\0" "?" "\0"*/
282           "*" "\0" "UTF-8" "\0";
283# endif
284
285# if defined VMS
286      /* To avoid the troubles of an extra file charset.alias_vms in the
287         sources of many GNU packages, simply inline the aliases here.  */
288      /* The list of encodings is taken from the OpenVMS 7.3-1 documentation
289         "Compaq C Run-Time Library Reference Manual for OpenVMS systems"
290         section 10.7 "Handling Different Character Sets".  */
291      cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
292           "ISO8859-2" "\0" "ISO-8859-2" "\0"
293           "ISO8859-5" "\0" "ISO-8859-5" "\0"
294           "ISO8859-7" "\0" "ISO-8859-7" "\0"
295           "ISO8859-8" "\0" "ISO-8859-8" "\0"
296           "ISO8859-9" "\0" "ISO-8859-9" "\0"
297           /* Japanese */
298           "eucJP" "\0" "EUC-JP" "\0"
299           "SJIS" "\0" "SHIFT_JIS" "\0"
300           "DECKANJI" "\0" "DEC-KANJI" "\0"
301           "SDECKANJI" "\0" "EUC-JP" "\0"
302           /* Chinese */
303           "eucTW" "\0" "EUC-TW" "\0"
304           "DECHANYU" "\0" "DEC-HANYU" "\0"
305           "DECHANZI" "\0" "GB2312" "\0"
306           /* Korean */
307           "DECKOREAN" "\0" "EUC-KR" "\0";
308# endif
309
310# if defined WINDOWS_NATIVE || defined __CYGWIN__
311      /* To avoid the troubles of installing a separate file in the same
312         directory as the DLL and of retrieving the DLL's directory at
313         runtime, simply inline the aliases here.  */
314
315      cp = "CP936" "\0" "GBK" "\0"
316           "CP1361" "\0" "JOHAB" "\0"
317           "CP20127" "\0" "ASCII" "\0"
318           "CP20866" "\0" "KOI8-R" "\0"
319           "CP20936" "\0" "GB2312" "\0"
320           "CP21866" "\0" "KOI8-RU" "\0"
321           "CP28591" "\0" "ISO-8859-1" "\0"
322           "CP28592" "\0" "ISO-8859-2" "\0"
323           "CP28593" "\0" "ISO-8859-3" "\0"
324           "CP28594" "\0" "ISO-8859-4" "\0"
325           "CP28595" "\0" "ISO-8859-5" "\0"
326           "CP28596" "\0" "ISO-8859-6" "\0"
327           "CP28597" "\0" "ISO-8859-7" "\0"
328           "CP28598" "\0" "ISO-8859-8" "\0"
329           "CP28599" "\0" "ISO-8859-9" "\0"
330           "CP28605" "\0" "ISO-8859-15" "\0"
331           "CP38598" "\0" "ISO-8859-8" "\0"
332           "CP51932" "\0" "EUC-JP" "\0"
333           "CP51936" "\0" "GB2312" "\0"
334           "CP51949" "\0" "EUC-KR" "\0"
335           "CP51950" "\0" "EUC-TW" "\0"
336           "CP54936" "\0" "GB18030" "\0"
337           "CP65001" "\0" "UTF-8" "\0";
338# endif
339#endif
340
341      charset_aliases = cp;
342    }
343
344  return cp;
345}
346
/* Determine the current locale's character encoding, and canonicalize it
   into one of the canonical names listed in config.charset.
   The result must not be freed; it is statically allocated.
   If the canonical name cannot be determined, the result is a non-canonical
   name.
   The result is never NULL and never the empty string: GNU libc and GNU
   libiconv interpret the empty string as "the locale's character encoding",
   so returning it would make GNU libiconv call this function again.
   On Cygwin and OS/2, an encoding given after the dot in LC_ALL, LC_CTYPE or
   LANG is returned as is, without being resolved through the alias table.  */

#ifdef STATIC
STATIC
#endif
const char *
locale_charset (void)
{
  const char *codeset;
  const char *aliases;

#if !(defined WINDOWS_NATIVE || defined OS2)

# if HAVE_LANGINFO_CODESET

  /* Most systems support nl_langinfo (CODESET) nowadays.  */
  codeset = nl_langinfo (CODESET);

#  ifdef __CYGWIN__
  /* Cygwin < 1.7 does not have locales.  nl_langinfo (CODESET) always
     returns "US-ASCII".  Return the suffix of the locale name from the
     environment variables (if present) or the codepage as a number.  */
  if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0)
    {
      const char *locale;
      static char buf[2 + 10 + 1];

      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
        }
      if (locale != NULL && locale[0] != '\0')
        {
          /* If the locale name contains an encoding after the dot, return
             it.  */
          const char *dot = strchr (locale, '.');

          if (dot != NULL)
            {
              const char *modifier;
              size_t len;

              dot++;
              /* Look for the possible @... trailer and remove it, if any.  */
              modifier = strchr (dot, '@');
              len = (modifier != NULL
                     ? (size_t) (modifier - dot)
                     : strlen (dot));
              /* An empty encoding (as in "en_US." or "en_US.@euro") must
                 not be returned, because this function never returns the
                 empty string; fall back to the codepage instead.  */
              if (len > 0)
                {
                  if (modifier == NULL)
                    return dot;
                  if (len < sizeof (buf))
                    {
                      memcpy (buf, dot, len);
                      buf[len] = '\0';
                      return buf;
                    }
                }
            }
        }

      /* The Windows API has a function returning the locale's codepage as a
         number: GetACP().  This encoding is used by Cygwin, unless the user
         has set the environment variable CYGWIN=codepage:oem (which very few
         people do).
         Output directed to console windows needs to be converted (to
         GetOEMCP() if the console is using a raster font, or to
         GetConsoleOutputCP() if it is using a TrueType font).  Cygwin does
         this conversion transparently (see winsup/cygwin/fhandler_console.cc),
         converting to GetConsoleOutputCP().  This leads to correct results,
         except when SetConsoleOutputCP has been called and a raster font is
         in use.  */
      sprintf (buf, "CP%u", GetACP ());
      codeset = buf;
    }
#  endif

# else

  /* On old systems which lack it, use setlocale or getenv.  */
  const char *locale = NULL;

  /* But most old systems don't have a complete set of locales.  Some
     (like SunOS 4 or DJGPP) have only the C locale.  Therefore we don't
     use setlocale here; it would return "C" when it doesn't support the
     locale name the user has set.  */
#  if 0
  locale = setlocale (LC_CTYPE, NULL);
#  endif
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
        }
    }

  /* On some old systems, one used to set locale = "iso8859_1". On others,
     you set it to "language_COUNTRY.charset". In any case, we resolve it
     through the charset.alias file.  */
  codeset = locale;

# endif

#elif defined WINDOWS_NATIVE

  static char buf[2 + 10 + 1];

  /* The Windows API has a function returning the locale's codepage as a
     number: GetACP().
     When the output goes to a console window, it needs to be provided in
     GetOEMCP() encoding if the console is using a raster font, or in
     GetConsoleOutputCP() encoding if it is using a TrueType font.
     But in GUI programs and for output sent to files and pipes, GetACP()
     encoding is the best bet.  */
  sprintf (buf, "CP%u", GetACP ());
  codeset = buf;

#elif defined OS2

  const char *locale;
  static char buf[2 + 10 + 1];
  ULONG cp[3];
  ULONG cplen;

  /* Allow user to override the codeset, as set in the operating system,
     with standard language environment variables.  */
  locale = getenv ("LC_ALL");
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_CTYPE");
      if (locale == NULL || locale[0] == '\0')
        locale = getenv ("LANG");
    }
  if (locale != NULL && locale[0] != '\0')
    {
      /* If the locale name contains an encoding after the dot, return it.  */
      const char *dot = strchr (locale, '.');

      if (dot != NULL)
        {
          const char *modifier;
          size_t len;

          dot++;
          /* Look for the possible @... trailer and remove it, if any.  */
          modifier = strchr (dot, '@');
          len = (modifier != NULL
                 ? (size_t) (modifier - dot)
                 : strlen (dot));
          /* An empty encoding (as in "en_US." or "en_US.@euro") must not be
             returned, because this function never returns the empty string;
             resolve the whole locale name through the aliases instead.  */
          if (len > 0)
            {
              if (modifier == NULL)
                return dot;
              if (len < sizeof (buf))
                {
                  memcpy (buf, dot, len);
                  buf[len] = '\0';
                  return buf;
                }
            }
        }

      /* Resolve through the charset.alias file.  */
      codeset = locale;
    }
  else
    {
      /* OS/2 has a function returning the locale's codepage as a number.  */
      if (DosQueryCp (sizeof (cp), cp, &cplen))
        codeset = "";
      else
        {
          /* cp[0] is a ULONG; convert explicitly so that the argument type
             matches the conversion specification.  */
          sprintf (buf, "CP%lu", (unsigned long) cp[0]);
          codeset = buf;
        }
    }

#endif

  if (codeset == NULL)
    /* The canonical name cannot be determined.  */
    codeset = "";

  /* Resolve alias.  The table is a sequence of NUL-terminated
     (alias, canonical name) pairs ended by an empty string; an alias of "*"
     matches every codeset.  */
  for (aliases = get_charset_aliases ();
       *aliases != '\0';
       aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
    if (strcmp (codeset, aliases) == 0
        || (aliases[0] == '*' && aliases[1] == '\0'))
      {
        codeset = aliases + strlen (aliases) + 1;
        break;
      }

  /* Don't return an empty string.  GNU libc and GNU libiconv interpret
     the empty string as denoting "the locale's character encoding",
     thus GNU libiconv would call this function a second time.  */
  if (codeset[0] == '\0')
    codeset = "ASCII";

#ifdef DARWIN7
  /* Mac OS X sets MB_CUR_MAX to 1 when LC_ALL=C, and "UTF-8"
     (the default codeset) does not work when MB_CUR_MAX is 1.  */
  if (strcmp (codeset, "UTF-8") == 0 && MB_CUR_MAX <= 1)
    codeset = "ASCII";
#endif

  return codeset;
}
554