sort.c revision 438ad4ce2cffc27b2471b213c48f73375608510e
1/* sort.c - put input lines into order
2 *
3 * Copyright 2004, 2008 Rob Landley <rob@landley.net>
4 *
5 * See http://opengroup.org/onlinepubs/007904975/utilities/sort.html
6
7USE_SORT(NEWTOY(sort, USE_SORT_FLOAT("g")USE_SORT_BIG("S:T:m" "o:k*t:xbMcszdfi") "run", TOYFLAG_USR|TOYFLAG_BIN))
8
9config SORT
10  bool "sort"
11  default y
12  help
13    usage: sort [-run] [FILE...]
14
15    Sort all lines of text from input files (or stdin) to stdout.
16
17    -r	reverse
18    -u	unique lines only
19    -n	numeric order (instead of alphabetical)
20
21config SORT_BIG
22  bool "SuSv3 options (Support -ktcsbdfiozM)"
23  default y
24  depends on SORT
25  help
26    usage: sort [-bcdfiMsz] [-k#[,#[x]] [-t X]] [-o FILE]
27
28    -b	ignore leading blanks (or trailing blanks in second part of key)
29    -c	check whether input is sorted
30    -d	dictionary order (use alphanumeric and whitespace chars only)
31    -f	force uppercase (case insensitive sort)
32    -i	ignore nonprinting characters
33    -M	month sort (jan, feb, etc).
34    -x	Hexadecimal numerical sort
35    -s	skip fallback sort (only sort with keys)
36    -z	zero (null) terminated input
37    -k	sort by "key" (see below)
38    -t	use a key separator other than whitespace
39    -o	output to FILE instead of stdout
40
41    Sorting by key looks at a subset of the words on each line.  -k2
42    uses the second word to the end of the line, -k2,2 looks at only
43    the second word, -k2,4 looks from the start of the second to the end
44    of the fourth word.  Specifying multiple keys uses the later keys as
45    tie breakers, in order.  A type specifier appended to a sort key
46    (such as -k2,2n) applies only to sorting that key.
47
48config SORT_FLOAT
49  bool
50  default y
51  depends on SORT_BIG && TOYBOX_FLOAT
52  help
53    usage: sort [-g]
54
55    -g	general numeric sort (double precision with nan and inf)
56*/
57
58#define FOR_sort
59#include "toys.h"
60
61GLOBALS(
62  char *key_separator;
63  struct arg_list *raw_keys;
64  char *outfile;
65  char *ignore1, ignore2;   // GNU compatability NOPs for -S and -T.
66
67  void *key_list;
68  int linecount;
69  char **lines;
70)
71
// The sort types are n, g, M, and x.
// u, c, s, and z apply to top level only, not to keys.
// b at top level implies bb.
// The remaining options can be applied to sort keys.

// Ignore trailing blanks. The bit is built from an unsigned constant because
// shifting a signed 1 into the sign bit is undefined behavior; the cast keeps
// the macro's type int like the flag words it gets stored in.
#define FLAG_bb ((int)(1U<<31))
78
// One sort key: which slice of each line to compare, and how.
// Keys are chained in the order they should be tried.
struct sort_key
{
  // Following key in the chain, or NULL for the last one.
  struct sort_key *next_key;

  // [0] start word, [1] start char, [2] end word, [3] end char.
  // A zero entry means that part was not specified.
  unsigned range[4];

  // Option flags specific to this key; zero falls back to the global flags.
  int flags;
};
85
86// Copy of the part of this string corresponding to a key/flags.
87
88static char *get_key_data(char *str, struct sort_key *key, int flags)
89{
90  int start=0, end, len, i, j;
91
92  // Special case whole string, so we don't have to make a copy
93
94  if(key->range[0]==1 && !key->range[1] && !key->range[2] && !key->range[3]
95    && !(flags&(FLAG_b&FLAG_d&FLAG_f&FLAG_i&FLAG_bb))) return str;
96
97  // Find start of key on first pass, end on second pass
98
99  len = strlen(str);
100  for (j=0; j<2; j++) {
101    if (!key->range[2*j]) end=len;
102
103    // Loop through fields
104    else {
105      end=0;
106      for (i=1; i < key->range[2*j]+j; i++) {
107
108        // Skip leading blanks
109        if (str[end] && !TT.key_separator)
110          while (isspace(str[end])) end++;
111
112        // Skip body of key
113        for (; str[end]; end++) {
114          if (TT.key_separator) {
115            if (str[end]==*TT.key_separator) break;
116          } else if (isspace(str[end])) break;
117        }
118      }
119    }
120    if (!j) start=end;
121  }
122
123  // Key with explicit separator starts after the separator
124  if (TT.key_separator && str[start]==*TT.key_separator) start++;
125
126  // Strip leading and trailing whitespace if necessary
127  if (flags&FLAG_b) while (isspace(str[start])) start++;
128  if (flags&FLAG_bb) while (end>start && isspace(str[end-1])) end--;
129
130  // Handle offsets on start and end
131  if (key->range[3]) {
132    end += key->range[3]-1;
133    if (end>len) end=len;
134  }
135  if (key->range[1]) {
136    start += key->range[1]-1;
137    if (start>len) start=len;
138  }
139
140  // Make the copy
141  if (end<start) end=start;
142  str = xstrndup(str+start, end-start);
143
144  // Handle -d
145  if (flags&FLAG_d) {
146    for (start = end = 0; str[end]; end++)
147      if (isspace(str[end]) || isalnum(str[end])) str[start++] = str[end];
148    str[start] = 0;
149  }
150
151  // Handle -i
152  if (flags&FLAG_i) {
153    for (start = end = 0; str[end]; end++)
154      if (isprint(str[end])) str[start++] = str[end];
155    str[start] = 0;
156  }
157
158  // Handle -f
159  if (flags*FLAG_f) for(i=0; str[i]; i++) str[i] = toupper(str[i]);
160
161  return str;
162}
163
164// append a sort_key to key_list.
165
166static struct sort_key *add_key(void)
167{
168  void **stupid_compiler = &TT.key_list;
169  struct sort_key **pkey = (struct sort_key **)stupid_compiler;
170
171  while (*pkey) pkey = &((*pkey)->next_key);
172  return *pkey = xzalloc(sizeof(struct sort_key));
173}
174
175// Perform actual comparison
176static int compare_values(int flags, char *x, char *y)
177{
178  int ff = flags & (FLAG_n|FLAG_g|FLAG_M|FLAG_x);
179
180  // Ascii sort
181  if (!ff) return strcmp(x, y);
182
183  if (CFG_SORT_FLOAT && ff == FLAG_g) {
184    char *xx,*yy;
185    double dx = strtod(x,&xx), dy = strtod(y,&yy);
186    int xinf, yinf;
187
188    // not numbers < NaN < -infinity < numbers < +infinity
189
190    if (x==xx) return y==yy ? 0 : -1;
191    if (y==yy) return 1;
192
193    // Check for isnan
194    if (dx!=dx) return (dy!=dy) ? 0 : -1;
195    if (dy!=dy) return 1;
196
197    // Check for infinity.  (Could underflow, but avoids needing libm.)
198    xinf = (1.0/dx == 0.0);
199    yinf = (1.0/dy == 0.0);
200    if (xinf) {
201      if(dx<0) return (yinf && dy<0) ? 0 : -1;
202      return (yinf && dy>0) ? 0 : 1;
203    }
204    if (yinf) return dy<0 ? 1 : -1;
205
206    return dx>dy ? 1 : (dx<dy ? -1 : 0);
207  } else if (CFG_SORT_BIG && ff == FLAG_M) {
208    struct tm thyme;
209    int dx;
210    char *xx,*yy;
211
212    xx = strptime(x,"%b",&thyme);
213    dx = thyme.tm_mon;
214    yy = strptime(y,"%b",&thyme);
215    if (!xx) return !yy ? 0 : -1;
216    else if (!yy) return 1;
217    else return dx==thyme.tm_mon ? 0 : dx-thyme.tm_mon;
218
219  } else if (CFG_SORT_BIG && ff == FLAG_x) {
220    return strtol(x, NULL, 16)-strtol(y, NULL, 16);
221  // This has to be ff == FLAG_n
222  } else {
223    // Full floating point version of -n
224    if (CFG_SORT_FLOAT) {
225      double dx = atof(x), dy = atof(y);
226
227      return dx>dy ? 1 : (dx<dy ? -1 : 0);
228    // Integer version of -n for tiny systems
229    } else return atoi(x)-atoi(y);
230  }
231}
232
233// Callback from qsort(): Iterate through key_list and perform comparisons.
234static int compare_keys(const void *xarg, const void *yarg)
235{
236  int flags = toys.optflags, retval = 0;
237  char *x, *y, *xx = *(char **)xarg, *yy = *(char **)yarg;
238  struct sort_key *key;
239
240  if (CFG_SORT_BIG) {
241    for (key=(struct sort_key *)TT.key_list; !retval && key;
242       key = key->next_key)
243    {
244      flags = key->flags ? key->flags : toys.optflags;
245
246      // Chop out and modify key chunks, handling -dfib
247
248      x = get_key_data(xx, key, flags);
249      y = get_key_data(yy, key, flags);
250
251      retval = compare_values(flags, x, y);
252
253      // Free the copies get_key_data() made.
254
255      if (x != xx) free(x);
256      if (y != yy) free(y);
257
258      if (retval) break;
259    }
260  } else retval = compare_values(flags, xx, yy);
261
262  // Perform fallback sort if necessary
263  if (!retval && !(CFG_SORT_BIG && (toys.optflags&FLAG_s))) {
264    retval = strcmp(xx, yy);
265    flags = toys.optflags;
266  }
267
268  return retval * ((flags&FLAG_r) ? -1 : 1);
269}
270
271// Callback from loopfiles to handle input files.
272static void sort_read(int fd, char *name)
273{
274  // Read each line from file, appending to a big array.
275
276  for (;;) {
277    char * line = (CFG_SORT_BIG && (toys.optflags&FLAG_z))
278             ? get_rawline(fd, NULL, 0) : get_line(fd);
279
280    if (!line) break;
281
282    // handle -c here so we don't allocate more memory than necessary.
283    if (CFG_SORT_BIG && (toys.optflags&FLAG_c)) {
284      int j = (toys.optflags&FLAG_u) ? -1 : 0;
285
286      if (TT.lines && compare_keys((void *)&TT.lines, &line)>j)
287        error_exit("%s: Check line %d\n", name, TT.linecount);
288      free(TT.lines);
289      TT.lines = (char **)line;
290    } else {
291      if (!(TT.linecount&63))
292        TT.lines = xrealloc(TT.lines, sizeof(char *)*(TT.linecount+64));
293      TT.lines[TT.linecount] = line;
294    }
295    TT.linecount++;
296  }
297}
298
299void sort_main(void)
300{
301  int idx, fd = 1;
302
303  // Open output file if necessary.
304  if (CFG_SORT_BIG && TT.outfile)
305    fd = xcreate(TT.outfile, O_CREAT|O_TRUNC|O_WRONLY, 0666);
306
307  // Parse -k sort keys.
308  if (CFG_SORT_BIG && TT.raw_keys) {
309    struct arg_list *arg;
310
311    for (arg = TT.raw_keys; arg; arg = arg->next) {
312      struct sort_key *key = add_key();
313      char *temp;
314      int flag;
315
316      idx = 0;
317      temp = arg->arg;
318      while (*temp) {
319        // Start of range
320        key->range[2*idx] = (unsigned)strtol(temp, &temp, 10);
321        if (*temp=='.')
322          key->range[(2*idx)+1] = (unsigned)strtol(temp+1, &temp, 10);
323
324        // Handle flags appended to a key type.
325        for (;*temp;temp++) {
326          char *temp2, *optlist;
327
328          // Note that a second comma becomes an "Unknown key" error.
329
330          if (*temp==',' && !idx++) {
331            temp++;
332            break;
333          }
334
335          // Which flag is this?
336
337          optlist = toys.which->options;
338          temp2 = strchr(optlist, *temp);
339          flag = (1<<(optlist-temp2+strlen(optlist)-1));
340
341          // Was it a flag that can apply to a key?
342
343          if (!temp2 || flag>FLAG_b
344            || (flag&(FLAG_u|FLAG_c|FLAG_s|FLAG_z)))
345          {
346            error_exit("Unknown key option.");
347          }
348          // b after , means strip _trailing_ space, not leading.
349          if (idx && flag==FLAG_b) flag = FLAG_bb;
350          key->flags |= flag;
351        }
352      }
353    }
354  }
355
356  // global b flag strips both leading and trailing spaces
357  if (toys.optflags&FLAG_b) toys.optflags |= FLAG_bb;
358
359  // If no keys, perform alphabetic sort over the whole line.
360  if (CFG_SORT_BIG && !TT.key_list) add_key()->range[0] = 1;
361
362  // Open input files and read data, populating TT.lines[TT.linecount]
363  loopfiles(toys.optargs, sort_read);
364
365  // The compare (-c) logic was handled in sort_read(),
366  // so if we got here, we're done.
367  if (CFG_SORT_BIG && (toys.optflags&FLAG_c)) goto exit_now;
368
369  // Perform the actual sort
370  qsort(TT.lines, TT.linecount, sizeof(char *), compare_keys);
371
372  // handle unique (-u)
373  if (toys.optflags&FLAG_u) {
374    int jdx;
375
376    for (jdx=0, idx=1; idx<TT.linecount; idx++) {
377      if (!compare_keys(&TT.lines[jdx], &TT.lines[idx]))
378        free(TT.lines[idx]);
379      else TT.lines[++jdx] = TT.lines[idx];
380    }
381    if (TT.linecount) TT.linecount = jdx+1;
382  }
383
384  // Output result
385  for (idx = 0; idx<TT.linecount; idx++) {
386    char *s = TT.lines[idx];
387    xwrite(fd, s, strlen(s));
388    if (CFG_TOYBOX_FREE) free(s);
389    xwrite(fd, "\n", 1);
390  }
391
392exit_now:
393  if (CFG_TOYBOX_FREE) {
394    if (fd != 1) close(fd);
395    free(TT.lines);
396  }
397}
398