1/* sort.c - put input lines into order
2 *
3 * Copyright 2004, 2008 Rob Landley <rob@landley.net>
4 *
5 * See http://opengroup.org/onlinepubs/007904975/utilities/sort.html
6 *
7 * Deviations from POSIX: Lots.
8 * We invented -x
9
10USE_SORT(NEWTOY(sort, USE_SORT_FLOAT("g")USE_SORT_BIG("S:T:m" "o:k*t:xbMcszdfi") "run", TOYFLAG_USR|TOYFLAG_BIN))
11
12config SORT
13  bool "sort"
14  default y
15  help
16    usage: sort [-run] [FILE...]
17
18    Sort all lines of text from input files (or stdin) to stdout.
19
20    -r	reverse
21    -u	unique lines only
22    -n	numeric order (instead of alphabetical)
23
24config SORT_BIG
25  bool "SuSv3 options (Support -ktcsbdfiozM)"
26  default y
27  depends on SORT
28  help
29    usage: sort [-bcdfiMsz] [-k#[,#[x]] [-t X]] [-o FILE]
30
31    -b	ignore leading blanks (or trailing blanks in second part of key)
32    -c	check whether input is sorted
33    -d	dictionary order (use alphanumeric and whitespace chars only)
34    -f	force uppercase (case insensitive sort)
35    -i	ignore nonprinting characters
36    -M	month sort (jan, feb, etc).
37    -x	Hexadecimal numerical sort
38    -s	skip fallback sort (only sort with keys)
39    -z	zero (null) terminated lines
40    -k	sort by "key" (see below)
41    -t	use a key separator other than whitespace
42    -o	output to FILE instead of stdout
43
44    Sorting by key looks at a subset of the words on each line.  -k2
45    uses the second word to the end of the line, -k2,2 looks at only
46    the second word, -k2,4 looks from the start of the second to the end
47    of the fourth word.  Specifying multiple keys uses the later keys as
48    tie breakers, in order.  A type specifier appended to a sort key
    (such as -k2,2n) applies only to sorting that key.
50
51config SORT_FLOAT
52  bool
53  default y
54  depends on SORT_BIG && TOYBOX_FLOAT
55  help
56    usage: sort [-g]
57
58    -g	general numeric sort (double precision with nan and inf)
59*/
60
61#define FOR_sort
62#include "toys.h"
63
64GLOBALS(
65  char *key_separator;
66  struct arg_list *raw_keys;
67  char *outfile;
68  char *ignore1, ignore2;   // GNU compatability NOPs for -S and -T.
69
70  void *key_list;
71  int linecount;
72  char **lines;
73)
74
// The sort types are n, g, M, and x.
76// u, c, s, and z apply to top level only, not to keys.
77// b at top level implies bb.
78// The remaining options can be applied to search keys.
79
// Ignore trailing blanks.  Unsigned constant: 1<<31 does not fit in an int.
#define FLAG_bb (1U<<31)
81
// One sort key, chained into the list that hangs off TT.key_list.
struct sort_key {
  // Next key in the list, NULL for the last one.
  struct sort_key *next_key;

  // [0] start word, [1] start char, [2] end word, [3] end char.
  // An end word of 0 means "to end of line"; a char offset of 0 means none.
  unsigned range[4];

  // Option flags for this key; 0 means fall back to the global flags.
  int flags;
};
88
89// Copy of the part of this string corresponding to a key/flags.
90
91static char *get_key_data(char *str, struct sort_key *key, int flags)
92{
93  int start=0, end, len, i, j;
94
95  // Special case whole string, so we don't have to make a copy
96
97  if(key->range[0]==1 && !key->range[1] && !key->range[2] && !key->range[3]
98    && !(flags&(FLAG_b|FLAG_d|FLAG_i|FLAG_bb))) return str;
99
100  // Find start of key on first pass, end on second pass
101
102  len = strlen(str);
103  for (j=0; j<2; j++) {
104    if (!key->range[2*j]) end=len;
105
106    // Loop through fields
107    else {
108      end=0;
109      for (i=1; i < key->range[2*j]+j; i++) {
110
111        // Skip leading blanks
112        if (str[end] && !TT.key_separator)
113          while (isspace(str[end])) end++;
114
115        // Skip body of key
116        for (; str[end]; end++) {
117          if (TT.key_separator) {
118            if (str[end]==*TT.key_separator) {
119              end++;
120              break;
121            }
122          } else if (isspace(str[end])) break;
123        }
124      }
125    }
126    if (!j) start=end;
127  }
128
129  // Key with explicit separator starts after the separator
130  if (TT.key_separator && str[start]==*TT.key_separator) start++;
131
132  // Strip leading and trailing whitespace if necessary
133  if (flags&FLAG_b) while (isspace(str[start])) start++;
134  if (flags&FLAG_bb) while (end>start && isspace(str[end-1])) end--;
135
136  // Handle offsets on start and end
137  if (key->range[3]) {
138    end += key->range[3]-1;
139    if (end>len) end=len;
140  }
141  if (key->range[1]) {
142    start += key->range[1]-1;
143    if (start>len) start=len;
144  }
145
146  // Make the copy
147  if (end<start) end=start;
148  str = xstrndup(str+start, end-start);
149
150  // Handle -d
151  if (flags&FLAG_d) {
152    for (start = end = 0; str[end]; end++)
153      if (isspace(str[end]) || isalnum(str[end])) str[start++] = str[end];
154    str[start] = 0;
155  }
156
157  // Handle -i
158  if (flags&FLAG_i) {
159    for (start = end = 0; str[end]; end++)
160      if (isprint(str[end])) str[start++] = str[end];
161    str[start] = 0;
162  }
163
164  return str;
165}
166
167// append a sort_key to key_list.
168
169static struct sort_key *add_key(void)
170{
171  void **stupid_compiler = &TT.key_list;
172  struct sort_key **pkey = (struct sort_key **)stupid_compiler;
173
174  while (*pkey) pkey = &((*pkey)->next_key);
175  return *pkey = xzalloc(sizeof(struct sort_key));
176}
177
178// Perform actual comparison
179static int compare_values(int flags, char *x, char *y)
180{
181  int ff = flags & (FLAG_n|FLAG_g|FLAG_M|FLAG_x);
182
183  // Ascii sort
184  if (!ff) return ((flags&FLAG_f) ? strcasecmp : strcmp)(x, y);
185
186  if (CFG_SORT_FLOAT && ff == FLAG_g) {
187    char *xx,*yy;
188    double dx = strtod(x,&xx), dy = strtod(y,&yy);
189    int xinf, yinf;
190
191    // not numbers < NaN < -infinity < numbers < +infinity
192
193    if (x==xx) return y==yy ? 0 : -1;
194    if (y==yy) return 1;
195
196    // Check for isnan
197    if (dx!=dx) return (dy!=dy) ? 0 : -1;
198    if (dy!=dy) return 1;
199
200    // Check for infinity.  (Could underflow, but avoids needing libm.)
201    xinf = (1.0/dx == 0.0);
202    yinf = (1.0/dy == 0.0);
203    if (xinf) {
204      if(dx<0) return (yinf && dy<0) ? 0 : -1;
205      return (yinf && dy>0) ? 0 : 1;
206    }
207    if (yinf) return dy<0 ? 1 : -1;
208
209    return dx>dy ? 1 : (dx<dy ? -1 : 0);
210  } else if (CFG_SORT_BIG && ff == FLAG_M) {
211    struct tm thyme;
212    int dx;
213    char *xx,*yy;
214
215    xx = strptime(x,"%b",&thyme);
216    dx = thyme.tm_mon;
217    yy = strptime(y,"%b",&thyme);
218    if (!xx) return !yy ? 0 : -1;
219    else if (!yy) return 1;
220    else return dx==thyme.tm_mon ? 0 : dx-thyme.tm_mon;
221
222  } else if (CFG_SORT_BIG && ff == FLAG_x) {
223    return strtol(x, NULL, 16)-strtol(y, NULL, 16);
224  // This has to be ff == FLAG_n
225  } else {
226    // Full floating point version of -n
227    if (CFG_SORT_FLOAT) {
228      double dx = atof(x), dy = atof(y);
229
230      return dx>dy ? 1 : (dx<dy ? -1 : 0);
231    // Integer version of -n for tiny systems
232    } else return atoi(x)-atoi(y);
233  }
234}
235
236// Callback from qsort(): Iterate through key_list and perform comparisons.
237static int compare_keys(const void *xarg, const void *yarg)
238{
239  int flags = toys.optflags, retval = 0;
240  char *x, *y, *xx = *(char **)xarg, *yy = *(char **)yarg;
241  struct sort_key *key;
242
243  if (CFG_SORT_BIG) {
244    for (key=(struct sort_key *)TT.key_list; !retval && key;
245       key = key->next_key)
246    {
247      flags = key->flags ? key->flags : toys.optflags;
248
249      // Chop out and modify key chunks, handling -dfib
250
251      x = get_key_data(xx, key, flags);
252      y = get_key_data(yy, key, flags);
253
254      retval = compare_values(flags, x, y);
255
256      // Free the copies get_key_data() made.
257
258      if (x != xx) free(x);
259      if (y != yy) free(y);
260
261      if (retval) break;
262    }
263  } else retval = compare_values(flags, xx, yy);
264
265  // Perform fallback sort if necessary (always case insensitive, no -f,
266  // the point is to get a stable order even for -f sorts)
267  if (!retval && !(CFG_SORT_BIG && (toys.optflags&FLAG_s))) {
268    flags = toys.optflags;
269    retval = strcmp(xx, yy);
270  }
271
272  return retval * ((flags&FLAG_r) ? -1 : 1);
273}
274
275// Callback from loopfiles to handle input files.
276static void sort_read(int fd, char *name)
277{
278  // Read each line from file, appending to a big array.
279
280  for (;;) {
281    char * line = (CFG_SORT_BIG && (toys.optflags&FLAG_z))
282             ? get_rawline(fd, NULL, 0) : get_line(fd);
283
284    if (!line) break;
285
286    // handle -c here so we don't allocate more memory than necessary.
287    if (CFG_SORT_BIG && (toys.optflags&FLAG_c)) {
288      int j = (toys.optflags&FLAG_u) ? -1 : 0;
289
290      if (TT.lines && compare_keys((void *)&TT.lines, &line)>j)
291        error_exit("%s: Check line %d\n", name, TT.linecount);
292      free(TT.lines);
293      TT.lines = (char **)line;
294    } else {
295      if (!(TT.linecount&63))
296        TT.lines = xrealloc(TT.lines, sizeof(char *)*(TT.linecount+64));
297      TT.lines[TT.linecount] = line;
298    }
299    TT.linecount++;
300  }
301}
302
303void sort_main(void)
304{
305  int idx, fd = 1;
306
307  // Open output file if necessary.
308  if (CFG_SORT_BIG && TT.outfile)
309    fd = xcreate(TT.outfile, O_CREAT|O_TRUNC|O_WRONLY, 0666);
310
311  // Parse -k sort keys.
312  if (CFG_SORT_BIG && TT.raw_keys) {
313    struct arg_list *arg;
314
315    for (arg = TT.raw_keys; arg; arg = arg->next) {
316      struct sort_key *key = add_key();
317      char *temp;
318      int flag;
319
320      idx = 0;
321      temp = arg->arg;
322      while (*temp) {
323        // Start of range
324        key->range[2*idx] = (unsigned)strtol(temp, &temp, 10);
325        if (*temp=='.')
326          key->range[(2*idx)+1] = (unsigned)strtol(temp+1, &temp, 10);
327
328        // Handle flags appended to a key type.
329        for (;*temp;temp++) {
330          char *temp2, *optlist;
331
332          // Note that a second comma becomes an "Unknown key" error.
333
334          if (*temp==',' && !idx++) {
335            temp++;
336            break;
337          }
338
339          // Which flag is this?
340
341          optlist = toys.which->options;
342          temp2 = strchr(optlist, *temp);
343          flag = (1<<(optlist-temp2+strlen(optlist)-1));
344
345          // Was it a flag that can apply to a key?
346
347          if (!temp2 || flag>FLAG_b
348            || (flag&(FLAG_u|FLAG_c|FLAG_s|FLAG_z)))
349          {
350            error_exit("Unknown key option.");
351          }
352          // b after , means strip _trailing_ space, not leading.
353          if (idx && flag==FLAG_b) flag = FLAG_bb;
354          key->flags |= flag;
355        }
356      }
357    }
358  }
359
360  // global b flag strips both leading and trailing spaces
361  if (toys.optflags&FLAG_b) toys.optflags |= FLAG_bb;
362
363  // If no keys, perform alphabetic sort over the whole line.
364  if (CFG_SORT_BIG && !TT.key_list) add_key()->range[0] = 1;
365
366  // Open input files and read data, populating TT.lines[TT.linecount]
367  loopfiles(toys.optargs, sort_read);
368
369  // The compare (-c) logic was handled in sort_read(),
370  // so if we got here, we're done.
371  if (CFG_SORT_BIG && (toys.optflags&FLAG_c)) goto exit_now;
372
373  // Perform the actual sort
374  qsort(TT.lines, TT.linecount, sizeof(char *), compare_keys);
375
376  // handle unique (-u)
377  if (toys.optflags&FLAG_u) {
378    int jdx;
379
380    for (jdx=0, idx=1; idx<TT.linecount; idx++) {
381      if (!compare_keys(&TT.lines[jdx], &TT.lines[idx]))
382        free(TT.lines[idx]);
383      else TT.lines[++jdx] = TT.lines[idx];
384    }
385    if (TT.linecount) TT.linecount = jdx+1;
386  }
387
388  // Output result
389  for (idx = 0; idx<TT.linecount; idx++) {
390    char *s = TT.lines[idx];
391    unsigned i = strlen(s);
392
393    if (!(toys.optflags&FLAG_z)) s[i] = '\n';
394    xwrite(fd, s, i+1);
395    if (CFG_TOYBOX_FREE) free(s);
396  }
397
398exit_now:
399  if (CFG_TOYBOX_FREE) {
400    if (fd != 1) close(fd);
401    free(TT.lines);
402  }
403}
404