1/* quotearg.c - quote arguments for output
2
3   Copyright (C) 1998, 1999, 2000, 2001, 2002, 2004, 2005, 2006 Free
4   Software Foundation, Inc.
5
6   This program is free software; you can redistribute it and/or modify
7   it under the terms of the GNU General Public License as published by
8   the Free Software Foundation; either version 2, or (at your option)
9   any later version.
10
11   This program is distributed in the hope that it will be useful,
12   but WITHOUT ANY WARRANTY; without even the implied warranty of
13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14   GNU General Public License for more details.
15
16   You should have received a copy of the GNU General Public License
17   along with this program; if not, write to the Free Software Foundation,
18   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
19
20/* Written by Paul Eggert <eggert@twinsun.com> */
21
22#ifdef HAVE_CONFIG_H
23# include <config.h>
24#endif
25
26#include "quotearg.h"
27
28#include "xalloc.h"
29
30#include <ctype.h>
31#include <errno.h>
32#include <limits.h>
33#include <stdbool.h>
34#include <stdlib.h>
35#include <string.h>
36
37#include "gettext.h"
38#define _(msgid) gettext (msgid)
39#define N_(msgid) msgid
40
41#if HAVE_WCHAR_H
42
43/* BSD/OS 4.1 wchar.h requires FILE and struct tm to be declared.  */
44# include <stdio.h>
45# include <time.h>
46
47# include <wchar.h>
48#endif
49
#if !HAVE_MBRTOWC
/* Disable multibyte processing entirely.  Since MB_CUR_MAX is 1, the
   other macros are defined only for documentation and to satisfy C
   syntax.  */
# undef MB_CUR_MAX
# define MB_CUR_MAX 1
/* Stand-in for mbrtowc: store the byte at S in *PWC and yield 0 if it
   is a null byte, 1 otherwise.  N and PS are ignored.  */
# define mbrtowc(pwc, s, n, ps) ((*(pwc) = *(s)) != 0)
/* Classify the value as a single byte, using isprint.  */
# define iswprint(wc) isprint ((unsigned char) (wc))
/* Make the mbsinit fallback below apply unless mbsinit is already a
   macro.  */
# undef HAVE_MBSINIT
#endif

/* Without mbsinit, claim that the conversion state is always initial.  */
#if !defined mbsinit && !HAVE_MBSINIT
# define mbsinit(ps) 1
#endif

/* If iswprint is not already a macro, include <wctype.h> when it is
   available; if iswprint still is not available, treat every wide
   character as printable.  */
#ifndef iswprint
# if HAVE_WCTYPE_H
#  include <wctype.h>
# endif
# if !defined iswprint && !HAVE_ISWPRINT
#  define iswprint(wc) 1
# endif
#endif

/* Fallback for when the headers included above do not define
   SIZE_MAX.  */
#ifndef SIZE_MAX
# define SIZE_MAX ((size_t) -1)
#endif

/* Number of bits in an int; used as the word width of the
   quote_these_too bit vector.  */
#define INT_BITS (sizeof (int) * CHAR_BIT)
79
/* A set of quoting options: a basic style plus a per-byte set of
   characters that should be quoted in addition to those the style
   already quotes.  */
struct quoting_options
{
  /* Basic quoting style.  */
  enum quoting_style style;

  /* Quote the characters indicated by this bit vector even if the
     quoting style would not normally require them to be quoted.
     The bit for byte value C is bit C % INT_BITS of word
     C / INT_BITS; the array has one bit for each value in
     0..UCHAR_MAX.  */
  unsigned int quote_these_too[(UCHAR_MAX / INT_BITS) + 1];
};
89
/* Textual names of the quoting styles, terminated by a null pointer.
   Entry K names the style stored in entry K of quoting_style_vals,
   so the two tables must be kept in the same order.  */
char const *const quoting_style_args[] =
{
  "literal",
  "shell",
  "shell-always",
  "c",
  "escape",
  "locale",
  "clocale",
  NULL
};
102
/* Correspondences to quoting style names: entry K is the style named
   by entry K of quoting_style_args.  Unlike that table, this one has
   no terminating element.  */
enum quoting_style const quoting_style_vals[] =
{
  literal_quoting_style,
  shell_quoting_style,
  shell_always_quoting_style,
  c_quoting_style,
  escape_quoting_style,
  locale_quoting_style,
  clocale_quoting_style
};
114
/* The default quoting options, used by the functions in this file
   whenever they are given a null options pointer.  Being a static
   object without an initializer, it starts out with all members
   zero.  */
static struct quoting_options default_quoting_options;
117
118/* Allocate a new set of quoting options, with contents initially identical
119   to O if O is not null, or to the default if O is null.
120   It is the caller's responsibility to free the result.  */
121struct quoting_options *
122clone_quoting_options (struct quoting_options *o)
123{
124  int e = errno;
125  struct quoting_options *p = xmalloc (sizeof *p);
126  *p = *(o ? o : &default_quoting_options);
127  errno = e;
128  return p;
129}
130
131/* Get the value of O's quoting style.  If O is null, use the default.  */
132enum quoting_style
133get_quoting_style (struct quoting_options *o)
134{
135  return (o ? o : &default_quoting_options)->style;
136}
137
138/* In O (or in the default if O is null),
139   set the value of the quoting style to S.  */
140void
141set_quoting_style (struct quoting_options *o, enum quoting_style s)
142{
143  (o ? o : &default_quoting_options)->style = s;
144}
145
146/* In O (or in the default if O is null),
147   set the value of the quoting options for character C to I.
148   Return the old value.  Currently, the only values defined for I are
149   0 (the default) and 1 (which means to quote the character even if
150   it would not otherwise be quoted).  */
151int
152set_char_quoting (struct quoting_options *o, char c, int i)
153{
154  unsigned char uc = c;
155  unsigned int *p =
156    (o ? o : &default_quoting_options)->quote_these_too + uc / INT_BITS;
157  int shift = uc % INT_BITS;
158  int r = (*p >> shift) & 1;
159  *p ^= ((i & 1) ^ r) << shift;
160  return r;
161}
162
163/* MSGID approximates a quotation mark.  Return its translation if it
164   has one; otherwise, return either it or "\"", depending on S.  */
165static char const *
166gettext_quote (char const *msgid, enum quoting_style s)
167{
168  char const *translation = _(msgid);
169  if (translation == msgid && s == clocale_quoting_style)
170    translation = "\"";
171  return translation;
172}
173
174/* Place into buffer BUFFER (of size BUFFERSIZE) a quoted version of
175   argument ARG (of size ARGSIZE), using QUOTING_STYLE and the
176   non-quoting-style part of O to control quoting.
177   Terminate the output with a null character, and return the written
178   size of the output, not counting the terminating null.
179   If BUFFERSIZE is too small to store the output string, return the
180   value that would have been returned had BUFFERSIZE been large enough.
181   If ARGSIZE is SIZE_MAX, use the string length of the argument for ARGSIZE.
182
183   This function acts like quotearg_buffer (BUFFER, BUFFERSIZE, ARG,
184   ARGSIZE, O), except it uses QUOTING_STYLE instead of the quoting
185   style specified by O, and O may not be null.  */
186
187static size_t
188quotearg_buffer_restyled (char *buffer, size_t buffersize,
189			  char const *arg, size_t argsize,
190			  enum quoting_style quoting_style,
191			  struct quoting_options const *o)
192{
193  size_t i;
194  size_t len = 0;
195  char const *quote_string = 0;
196  size_t quote_string_len = 0;
197  bool backslash_escapes = false;
198  bool unibyte_locale = MB_CUR_MAX == 1;
199
200#define STORE(c) \
201    do \
202      { \
203	if (len < buffersize) \
204	  buffer[len] = (c); \
205	len++; \
206      } \
207    while (0)
208
209  switch (quoting_style)
210    {
211    case c_quoting_style:
212      STORE ('"');
213      backslash_escapes = true;
214      quote_string = "\"";
215      quote_string_len = 1;
216      break;
217
218    case escape_quoting_style:
219      backslash_escapes = true;
220      break;
221
222    case locale_quoting_style:
223    case clocale_quoting_style:
224      {
225	/* TRANSLATORS:
226	   Get translations for open and closing quotation marks.
227
228	   The message catalog should translate "`" to a left
229	   quotation mark suitable for the locale, and similarly for
230	   "'".  If the catalog has no translation,
231	   locale_quoting_style quotes `like this', and
232	   clocale_quoting_style quotes "like this".
233
234	   For example, an American English Unicode locale should
235	   translate "`" to U+201C (LEFT DOUBLE QUOTATION MARK), and
236	   should translate "'" to U+201D (RIGHT DOUBLE QUOTATION
237	   MARK).  A British English Unicode locale should instead
238	   translate these to U+2018 (LEFT SINGLE QUOTATION MARK) and
239	   U+2019 (RIGHT SINGLE QUOTATION MARK), respectively.
240
241	   If you don't know what to put here, please see
242	   <http://en.wikipedia.org/wiki/Quotation_mark#Glyphs>
243	   and use glyphs suitable for your language.  */
244
245	char const *left = gettext_quote (N_("`"), quoting_style);
246	char const *right = gettext_quote (N_("'"), quoting_style);
247	for (quote_string = left; *quote_string; quote_string++)
248	  STORE (*quote_string);
249	backslash_escapes = true;
250	quote_string = right;
251	quote_string_len = strlen (quote_string);
252      }
253      break;
254
255    case shell_always_quoting_style:
256      STORE ('\'');
257      quote_string = "'";
258      quote_string_len = 1;
259      break;
260
261    default:
262      break;
263    }
264
265  for (i = 0;  ! (argsize == SIZE_MAX ? arg[i] == '\0' : i == argsize);  i++)
266    {
267      unsigned char c;
268      unsigned char esc;
269
270      if (backslash_escapes
271	  && quote_string_len
272	  && i + quote_string_len <= argsize
273	  && memcmp (arg + i, quote_string, quote_string_len) == 0)
274	STORE ('\\');
275
276      c = arg[i];
277      switch (c)
278	{
279	case '\0':
280	  if (backslash_escapes)
281	    {
282	      STORE ('\\');
283	      STORE ('0');
284	      STORE ('0');
285	      c = '0';
286	    }
287	  break;
288
289	case '?':
290	  switch (quoting_style)
291	    {
292	    case shell_quoting_style:
293	      goto use_shell_always_quoting_style;
294
295	    case c_quoting_style:
296	      if (i + 2 < argsize && arg[i + 1] == '?')
297		switch (arg[i + 2])
298		  {
299		  case '!': case '\'':
300		  case '(': case ')': case '-': case '/':
301		  case '<': case '=': case '>':
302		    /* Escape the second '?' in what would otherwise be
303		       a trigraph.  */
304		    c = arg[i + 2];
305		    i += 2;
306		    STORE ('?');
307		    STORE ('\\');
308		    STORE ('?');
309		    break;
310
311		  default:
312		    break;
313		  }
314	      break;
315
316	    default:
317	      break;
318	    }
319	  break;
320
321	case '\a': esc = 'a'; goto c_escape;
322	case '\b': esc = 'b'; goto c_escape;
323	case '\f': esc = 'f'; goto c_escape;
324	case '\n': esc = 'n'; goto c_and_shell_escape;
325	case '\r': esc = 'r'; goto c_and_shell_escape;
326	case '\t': esc = 't'; goto c_and_shell_escape;
327	case '\v': esc = 'v'; goto c_escape;
328	case '\\': esc = c; goto c_and_shell_escape;
329
330	c_and_shell_escape:
331	  if (quoting_style == shell_quoting_style)
332	    goto use_shell_always_quoting_style;
333	c_escape:
334	  if (backslash_escapes)
335	    {
336	      c = esc;
337	      goto store_escape;
338	    }
339	  break;
340
341	case '{': case '}': /* sometimes special if isolated */
342	  if (! (argsize == SIZE_MAX ? arg[1] == '\0' : argsize == 1))
343	    break;
344	  /* Fall through.  */
345	case '#': case '~':
346	  if (i != 0)
347	    break;
348	  /* Fall through.  */
349	case ' ':
350	case '!': /* special in bash */
351	case '"': case '$': case '&':
352	case '(': case ')': case '*': case ';':
353	case '<':
354	case '=': /* sometimes special in 0th or (with "set -k") later args */
355	case '>': case '[':
356	case '^': /* special in old /bin/sh, e.g. SunOS 4.1.4 */
357	case '`': case '|':
358	  /* A shell special character.  In theory, '$' and '`' could
359	     be the first bytes of multibyte characters, which means
360	     we should check them with mbrtowc, but in practice this
361	     doesn't happen so it's not worth worrying about.  */
362	  if (quoting_style == shell_quoting_style)
363	    goto use_shell_always_quoting_style;
364	  break;
365
366	case '\'':
367	  switch (quoting_style)
368	    {
369	    case shell_quoting_style:
370	      goto use_shell_always_quoting_style;
371
372	    case shell_always_quoting_style:
373	      STORE ('\'');
374	      STORE ('\\');
375	      STORE ('\'');
376	      break;
377
378	    default:
379	      break;
380	    }
381	  break;
382
383	case '%': case '+': case ',': case '-': case '.': case '/':
384	case '0': case '1': case '2': case '3': case '4': case '5':
385	case '6': case '7': case '8': case '9': case ':':
386	case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
387	case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
388	case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
389	case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
390	case 'Y': case 'Z': case ']': case '_': case 'a': case 'b':
391	case 'c': case 'd': case 'e': case 'f': case 'g': case 'h':
392	case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
393	case 'o': case 'p': case 'q': case 'r': case 's': case 't':
394	case 'u': case 'v': case 'w': case 'x': case 'y': case 'z':
395	  /* These characters don't cause problems, no matter what the
396	     quoting style is.  They cannot start multibyte sequences.  */
397	  break;
398
399	default:
400	  /* If we have a multibyte sequence, copy it until we reach
401	     its end, find an error, or come back to the initial shift
402	     state.  For C-like styles, if the sequence has
403	     unprintable characters, escape the whole sequence, since
404	     we can't easily escape single characters within it.  */
405	  {
406	    /* Length of multibyte sequence found so far.  */
407	    size_t m;
408
409	    bool printable;
410
411	    if (unibyte_locale)
412	      {
413		m = 1;
414		printable = isprint (c) != 0;
415	      }
416	    else
417	      {
418		mbstate_t mbstate;
419		memset (&mbstate, 0, sizeof mbstate);
420
421		m = 0;
422		printable = true;
423		if (argsize == SIZE_MAX)
424		  argsize = strlen (arg);
425
426		do
427		  {
428		    wchar_t w;
429		    size_t bytes = mbrtowc (&w, &arg[i + m],
430					    argsize - (i + m), &mbstate);
431		    if (bytes == 0)
432		      break;
433		    else if (bytes == (size_t) -1)
434		      {
435			printable = false;
436			break;
437		      }
438		    else if (bytes == (size_t) -2)
439		      {
440			printable = false;
441			while (i + m < argsize && arg[i + m])
442			  m++;
443			break;
444		      }
445		    else
446		      {
447			/* Work around a bug with older shells that "see" a '\'
448			   that is really the 2nd byte of a multibyte character.
449			   In practice the problem is limited to ASCII
450			   chars >= '@' that are shell special chars.  */
451			if ('[' == 0x5b && quoting_style == shell_quoting_style)
452			  {
453			    size_t j;
454			    for (j = 1; j < bytes; j++)
455			      switch (arg[i + m + j])
456				{
457				case '[': case '\\': case '^':
458				case '`': case '|':
459				  goto use_shell_always_quoting_style;
460
461				default:
462				  break;
463				}
464			  }
465
466			if (! iswprint (w))
467			  printable = false;
468			m += bytes;
469		      }
470		  }
471		while (! mbsinit (&mbstate));
472	      }
473
474	    if (1 < m || (backslash_escapes && ! printable))
475	      {
476		/* Output a multibyte sequence, or an escaped
477		   unprintable unibyte character.  */
478		size_t ilim = i + m;
479
480		for (;;)
481		  {
482		    if (backslash_escapes && ! printable)
483		      {
484			STORE ('\\');
485			STORE ('0' + (c >> 6));
486			STORE ('0' + ((c >> 3) & 7));
487			c = '0' + (c & 7);
488		      }
489		    if (ilim <= i + 1)
490		      break;
491		    STORE (c);
492		    c = arg[++i];
493		  }
494
495		goto store_c;
496	      }
497	  }
498	}
499
500      if (! (backslash_escapes
501	     && o->quote_these_too[c / INT_BITS] & (1 << (c % INT_BITS))))
502	goto store_c;
503
504    store_escape:
505      STORE ('\\');
506
507    store_c:
508      STORE (c);
509    }
510
511  if (i == 0 && quoting_style == shell_quoting_style)
512    goto use_shell_always_quoting_style;
513
514  if (quote_string)
515    for (; *quote_string; quote_string++)
516      STORE (*quote_string);
517
518  if (len < buffersize)
519    buffer[len] = '\0';
520  return len;
521
522 use_shell_always_quoting_style:
523  return quotearg_buffer_restyled (buffer, buffersize, arg, argsize,
524				   shell_always_quoting_style, o);
525}
526
527/* Place into buffer BUFFER (of size BUFFERSIZE) a quoted version of
528   argument ARG (of size ARGSIZE), using O to control quoting.
529   If O is null, use the default.
530   Terminate the output with a null character, and return the written
531   size of the output, not counting the terminating null.
532   If BUFFERSIZE is too small to store the output string, return the
533   value that would have been returned had BUFFERSIZE been large enough.
534   If ARGSIZE is SIZE_MAX, use the string length of the argument for
535   ARGSIZE.  */
536size_t
537quotearg_buffer (char *buffer, size_t buffersize,
538		 char const *arg, size_t argsize,
539		 struct quoting_options const *o)
540{
541  struct quoting_options const *p = o ? o : &default_quoting_options;
542  int e = errno;
543  size_t r = quotearg_buffer_restyled (buffer, buffersize, arg, argsize,
544				       p->style, p);
545  errno = e;
546  return r;
547}
548
/* Like quotearg_buffer (..., ARG, ARGSIZE, O), except that the quoted
   string is returned in newly allocated storage obtained from
   xmalloc.  errno is left as it was on entry.  */
char *
quotearg_alloc (char const *arg, size_t argsize,
                struct quoting_options const *o)
{
  int saved_errno = errno;

  /* First pass with an empty buffer: only measures the output.  */
  size_t quoted_len = quotearg_buffer (NULL, 0, arg, argsize, o);
  size_t capacity = quoted_len + 1;

  /* Second pass: produce the output, terminator included.  */
  char *result = xmalloc (capacity);
  quotearg_buffer (result, capacity, arg, argsize, o);

  errno = saved_errno;
  return result;
}
562
/* Return a quoted version of argument ARG, stored in slot number N.
   ARG has ARGSIZE bytes, unless ARGSIZE is SIZE_MAX, in which case
   ARG is a null-terminated string.
   OPTIONS gives the quoting options.
   The result lives in static storage that the next call with the
   same N may reuse.
   N must be nonnegative (the function aborts otherwise).  It has type
   "int" on purpose, to leave room for future extensions that use
   negative values.
   errno is left as it was on entry.  */
static char *
quotearg_n_options (int n, char const *arg, size_t argsize,
                    struct quoting_options const *options)
{
  /* One storage slot: a buffer and its allocated size.  */
  struct slotvec
    {
      size_t size;
      char *val;
    };

  /* Preallocate a slot 0 buffer, so that the caller can always quote
     one small component of a "memory exhausted" message in slot 0.  */
  static char slot0[256];
  static unsigned int nslots = 1;
  static struct slotvec slotvec0 = {sizeof slot0, slot0};
  static struct slotvec *slotvec = &slotvec0;

  int saved_errno = errno;
  unsigned int slot_index = n;
  struct slotvec *slot;
  size_t quoted_len;

  if (n < 0)
    abort ();

  if (slot_index >= nslots)
    {
      /* FIXME: technically, the type of new_count should be
         `unsigned int', but that evokes an unsuppressible warning
         from gcc-4.0.1 and older.  If gcc ever provides an option to
         suppress that warning, revert to the original type, so that
         the test in xalloc_oversized is once again performed only at
         compile time.  */
      size_t new_count = slot_index + 1;

      if (xalloc_oversized (new_count, sizeof *slotvec))
        xalloc_die ();

      /* The initial one-element vector is a static object and cannot
         be given to xrealloc; move it to the heap first.  */
      if (slotvec == &slotvec0)
        {
          slotvec = xmalloc (sizeof *slotvec);
          *slotvec = slotvec0;
        }
      slotvec = xrealloc (slotvec, new_count * sizeof *slotvec);

      /* The added slots start out empty: size 0, no buffer.  */
      memset (slotvec + nslots, 0, (new_count - nslots) * sizeof *slotvec);
      nslots = new_count;
    }

  slot = &slotvec[n];
  quoted_len = quotearg_buffer (slot->val, slot->size, arg, argsize,
                                options);

  if (quoted_len >= slot->size)
    {
      /* The buffer was too small: replace it by one of exactly the
         needed size and quote again.  */
      slot->size = quoted_len + 1;
      if (slot->val != slot0)
        free (slot->val);
      slot->val = xmalloc (slot->size);
      quotearg_buffer (slot->val, slot->size, arg, argsize, options);
    }

  errno = saved_errno;
  return slot->val;
}
633
634char *
635quotearg_n (int n, char const *arg)
636{
637  return quotearg_n_options (n, arg, SIZE_MAX, &default_quoting_options);
638}
639
640char *
641quotearg (char const *arg)
642{
643  return quotearg_n (0, arg);
644}
645
646/* Return quoting options for STYLE, with no extra quoting.  */
647static struct quoting_options
648quoting_options_from_style (enum quoting_style style)
649{
650  struct quoting_options o;
651  o.style = style;
652  memset (o.quote_these_too, 0, sizeof o.quote_these_too);
653  return o;
654}
655
656char *
657quotearg_n_style (int n, enum quoting_style s, char const *arg)
658{
659  struct quoting_options const o = quoting_options_from_style (s);
660  return quotearg_n_options (n, arg, SIZE_MAX, &o);
661}
662
663char *
664quotearg_n_style_mem (int n, enum quoting_style s,
665		      char const *arg, size_t argsize)
666{
667  struct quoting_options const o = quoting_options_from_style (s);
668  return quotearg_n_options (n, arg, argsize, &o);
669}
670
671char *
672quotearg_style (enum quoting_style s, char const *arg)
673{
674  return quotearg_n_style (0, s, arg);
675}
676
677char *
678quotearg_char (char const *arg, char ch)
679{
680  struct quoting_options options;
681  options = default_quoting_options;
682  set_char_quoting (&options, ch, 1);
683  return quotearg_n_options (0, arg, SIZE_MAX, &options);
684}
685
/* Like quotearg_char (ARG, ':'): quote ARG with the default quoting
   options, additionally marking ':' for quoting.  */
char *
quotearg_colon (char const *arg)
{
  char const colon = ':';
  return quotearg_char (arg, colon);
}
691