doio.c revision d218f348c12b42a78fa0306d9a033bfa4f67238b
1/*
2 * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like.  Any license provided herein, whether implied or
15 * otherwise, applies only to this software file.  Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc.,
21 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA  94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/NoticeExplan/
31 */
/*
 * doio -	a general purpose io initiator with system call and
 *		write logging.  See doio.h for the structure which defines
 *		what doio requests should look like.
 *
 *		Currently doio can handle read, write, reada, writea, ssread,
 *		sswrite, and many varieties of listio requests.
 *		For disk io, if the O_SSD flag is set doio will allocate
 *		the appropriate amount of ssd and do the transfer - thus, doio
 *		can handle all of the primitive types of file io.
 *
 * programming
 * notes:
 * -----------
 *	messages should generally be printed using doio_fprintf().
 *
 */
49
50#include <stdio.h>
51#include <errno.h>
52#include <fcntl.h>
53#include <stdlib.h>
54#include <signal.h>
55#include <string.h>
56#include <ctype.h>
57#include <unistd.h>
58#include <time.h>
59#include <stdarg.h>
60#include <sys/stat.h>
61#include <sys/param.h>
62#include <sys/types.h>
63#include <sys/sysmacros.h>
64#ifdef CRAY
65#include <sys/iosw.h>
66#endif
67#ifdef sgi
68#include <aio.h>		/* for aio_read,write */
69#include <inttypes.h>		/* for uint64_t type */
70#include <siginfo.h>		/* signal handlers & SA_SIGINFO */
71#endif
72#ifndef CRAY
73#include <sys/uio.h>		/* for struct iovec (readv) */
74#include <sys/mman.h>		/* for mmap(2) */
75#include <sys/ipc.h>		/* for i/o buffer in shared memory */
76#include <sys/shm.h>		/* for i/o buffer in shared memory */
77#endif
78#include <sys/wait.h>
79#ifdef CRAY
80#include <sys/listio.h>
81#include <sys/panic.h>
82#endif
83#include <sys/time.h>		/* for delays */
84
85#include "doio.h"
86#include "write_log.h"
87#include "random_range.h"
88#include "string_to_tokens.h"
89#include "pattern.h"
90
/*
 * Data buffer allocation bookkeeping.
 *
 * NMEMALLOC is the number of entries in the Memalloc[] table.  The MEM_*
 * values name kinds of memory and the MEMF_* values are octal flag bits;
 * presumably they are stored in the memtype and flags members below
 * (TODO confirm against parse_memalloc()/alloc_mem()).
 */
#define	NMEMALLOC	32

/* kinds of memory */
#define	MEM_DATA	1	/* data space */
#define	MEM_SHMEM	2	/* System V shared memory */
#define	MEM_T3ESHMEM	3	/* T3E Shared Memory */
#define	MEM_MMAP	4	/* mmap(2) */

/* flag bits */
#define	MEMF_PRIVATE	0001
#define	MEMF_AUTORESRV	0002
#define	MEMF_LOCAL	0004
#define	MEMF_SHARED	0010

#define	MEMF_FIXADDR	0100
#define	MEMF_ADDR	0200
#define	MEMF_AUTOGROW	0400
#define	MEMF_FILE	01000	/* regular file -- unlink on close */
#define	MEMF_MPIN	010000	/* use mpin(2) to lock pages in memory */

/* One entry of the allocation table. */
struct memalloc {
	int memtype;
	int flags;
	int nblks;
	char *name;
	void *space;		/* memory address of allocated space */
	int fd;			/* FD open for mmaping */
	int size;
};

struct memalloc Memalloc[NMEMALLOC];
117
118/*
119 * Structure for maintaining open file test descriptors.  Used by
120 * alloc_fd().
121 */
122
123struct fd_cache {
124	char c_file[MAX_FNAME_LENGTH + 1];
125	int c_oflags;
126	int c_fd;
127	long c_rtc;
128#ifdef sgi
129	int c_memalign;		/* from F_DIOINFO */
130	int c_miniosz;
131	int c_maxiosz;
132#endif
133#ifndef CRAY
134	void *c_memaddr;	/* mmapped address */
135	int c_memlen;		/* length of above region */
136#endif
137};
138
/*
 * Name-to-value pair.  Arrays of these translate command-line argument
 * strings into their integer values.
 */
struct smap {
	char *string;		/* symbolic name */
	int value;		/* value the name stands for */
};
147
148struct aio_info {
149	int busy;
150	int id;
151	int fd;
152	int strategy;
153	volatile int done;
154#ifdef CRAY
155	struct iosw iosw;
156#endif
157#ifdef sgi
158	aiocb_t aiocb;
159	int aio_ret;		/* from aio_return */
160	int aio_errno;		/* from aio_error */
161#endif
162	int sig;
163	int signalled;
164	struct sigaction osa;
165};
166
/* ---------------------------------------------------------------------------
 *
 * Table-driven r/w system calls: a per-syscall "stub" function builds the
 * arguments and issues the call, and code common to all system calls then
 * does the return checking, async I/O wait, iosw check, etc.
 *
 * Flags:
 *	WRITE, ASYNC, SSD/SDS,
 *	FILE_LOCK, WRITE_LOG, VERIFY_DATA,
 */

/* Result of one stub call. */
struct status {
	int rval;		/* syscall return */
	int err;		/* errno */
	int *aioid;		/* list of async I/O structures */
};

/* Description of one system call and the stubs that drive it. */
struct syscall_info {
	char *sy_name;
	int sy_type;
	struct status *(*sy_syscall) ();
	int (*sy_buffer) ();
	char *(*sy_format) ();
	int sy_flags;
	int sy_bits;
};
194
/* Octal flag bits describing a system call. */
#define	SY_WRITE		00001
#define	SY_ASYNC		00010
#define	SY_IOSW			00020
#define	SY_SDS			00100

#if !defined(O_SSD)
#define O_SSD 0			/* so code compiles on a CRAY2 */
#endif

#if defined(sgi)
#define UINT64_T uint64_t
#else
#define UINT64_T unsigned long
#endif

#if !defined(O_PARALLEL)
#define O_PARALLEL 0		/* so O_PARALLEL may be used in expressions */
#endif

#define PPID_CHECK_INTERVAL 5	/* check ppid once per this many requests */
#define	MAX_AIO		256	/* maximum number of async I/O ops */
#if defined(_CRAYMPP)
#define	MPP_BUMP	16	/* page un-alignment for MPP */
#else
#define	MPP_BUMP	0
#endif

/* Text for the current errno value. */
#define	SYSERR strerror(errno)

/*
 * Option string handed to getopt(): the supported cmdline arguments.
 */

#define OPTS	"aC:d:ehm:n:kr:w:vU:V:M:N:"

#define DEF_RELEASE_INTERVAL	0
231
232/*
233 * Flags set in parse_cmdline() to indicate which options were selected
234 * on the cmdline.
235 */
236
237int a_opt = 0;			/* abort on data compare errors     */
238int e_opt = 0;			/* exec() after fork()'ing          */
239int C_opt = 0;			/* Data Check Type                  */
240int d_opt = 0;			/* delay between operations         */
241int k_opt = 0;			/* lock file regions during writes  */
242int m_opt = 0;			/* generate periodic messages       */
243int n_opt = 0;			/* nprocs                           */
244int r_opt = 0;			/* resource release interval        */
245int w_opt = 0;			/* file write log file              */
246int v_opt = 0;			/* verify writes if set             */
247int U_opt = 0;			/* upanic() on varios conditions    */
248int V_opt = 0;			/* over-ride default validation fd type */
249int M_opt = 0;			/* data buffer allocation types     */
250char TagName[40];		/* name of this doio (see Monster)  */
251
252/*
253 * Misc globals initialized in parse_cmdline()
254 */
255
256char *Prog = NULL;		/* set up in parse_cmdline()                */
257int Upanic_Conditions;		/* set by args to -U                        */
258int Release_Interval;		/* arg to -r                                */
259int Nprocs;			/* arg to -n                                */
260char *Write_Log;		/* arg to -w                                */
261char *Infile;			/* input file (defaults to stdin)           */
262int *Children;			/* pids of child procs                      */
263int Nchildren = 0;
264int Nsiblings = 0;		/* tfork'ed siblings                        */
265int Execd = 0;
266int Message_Interval = 0;
267int Npes = 0;			/* non-zero if built as an mpp multi-pe app */
268int Vpe = -1;			/* Virtual pe number if Npes >= 0           */
269int Reqno = 1;			/* request # - used in some error messages  */
270int Reqskipcnt = 0;		/* count of I/O requests that are skipped   */
271int Validation_Flags;
272char *(*Data_Check) ();		/* function to call for data checking       */
273int (*Data_Fill) ();		/* function to call for data filling        */
274int Nmemalloc = 0;		/* number of memory allocation strategies   */
275int delayop = 0;		/* delay between operations - type of delay */
276int delaytime = 0;		/* delay between operations - how long      */
277
278struct wlog_file Wlog;
279
280int active_mmap_rw = 0;		/* Indicates that mmapped I/O is occurring. */
281			    /* Used by sigbus_action() in the child doio. */
282int havesigint = 0;
283
284#define SKIP_REQ	-2	/* skip I/O request */
285
286/*
287 * Global file descriptors
288 */
289
290int Wfd_Append;			/* for appending to the write-log       */
291int Wfd_Random;			/* for overlaying write-log entries     */
292
293#define FD_ALLOC_INCR	32	/* allocate this many fd_map structs    */
294				/* at a time */
295
296/*
297 * Globals for tracking Sds and Core usage
298 */
299
300char *Memptr;			/* ptr to core buffer space             */
301int Memsize;			/* # bytes pointed to by Memptr         */
302				/* maintained by alloc_mem()            */
303
304int Sdsptr;			/* sds offset (always 0)                */
305int Sdssize;			/* # bytes of allocated sds space       */
306				/* Maintained by alloc_sds()            */
307char Host[16];
308char Pattern[128];
309int Pattern_Length;
310
311/*
312 * Signal handlers, and related globals
313 */
314
/*
 * Forward declarations for the functions defined in this file.
 *
 * The #ifdef structure mirrors the definitions: CRAY-only, sgi-only and
 * "everything but CRAY" entry points are declared only where they exist.
 */

/* formatting helpers and top-level request handlers */
char *syserrno(int err);
void doio(void);
void doio_delay(void);
char *format_oflags(int oflags);
char *format_strat(int strategy);
char *format_rw(struct io_req *ioreq, int fd, void *buffer,
		int signo, char *pattern, void *iosw);
#ifdef CRAY
/* the separator between "int sds" and "char *pattern" was missing */
char *format_sds(struct io_req *ioreq, void *buffer, int sds, char *pattern);
#endif /* CRAY */

int do_read(struct io_req *req);
int do_write(struct io_req *req);
int lock_file_region(char *fname, int fd, int type, int start, int nbytes);

#ifdef CRAY
char *format_listio(struct io_req *ioreq, int lcmd,
		    struct listreq *list, int nent, int fd, char *pattern);
#endif /* CRAY */

int do_listio(struct io_req *req);

#if defined(_CRAY1) || defined(CRAY)
int do_ssdio(struct io_req *req);
#endif /* defined(_CRAY1) || defined(CRAY) */

char *fmt_ioreq(struct io_req *ioreq, struct syscall_info *sy, int fd);

/* per-syscall stubs (sy_*) and their formatters (fmt_*) */
#ifdef CRAY
struct status *sy_listio(struct io_req *req, struct syscall_info *sysc,
			 int fd, char *addr);
int listio_mem(struct io_req *req, int offset, int fmstride,
	       int *min, int *max);
char *fmt_listio(struct io_req *req, struct syscall_info *sy,
		 int fd, char *addr);
#endif /* CRAY */

#ifdef sgi
struct status *sy_pread(struct io_req *req, struct syscall_info *sysc,
			int fd, char *addr);
struct status *sy_pwrite(struct io_req *req, struct syscall_info *sysc,
			 int fd, char *addr);
char *fmt_pread(struct io_req *req, struct syscall_info *sy,
		int fd, char *addr);
#endif /* sgi */

#ifndef CRAY
struct status *sy_readv(struct io_req *req, struct syscall_info *sysc,
			int fd, char *addr);
struct status *sy_writev(struct io_req *req, struct syscall_info *sysc,
			 int fd, char *addr);
struct status *sy_rwv(struct io_req *req, struct syscall_info *sysc,
		      int fd, char *addr, int rw);
char *fmt_readv(struct io_req *req, struct syscall_info *sy,
		int fd, char *addr);
#endif /* !CRAY */

#ifdef sgi
struct status *sy_aread(struct io_req *req, struct syscall_info *sysc,
			int fd, char *addr);
/* the terminating ';' of this declaration was missing */
struct status *sy_awrite(struct io_req *req, struct syscall_info *sysc,
			 int fd, char *addr);
struct status *sy_arw(struct io_req *req, struct syscall_info *sysc,
		      int fd, char *addr, int rw);
char *fmt_aread(struct io_req *req, struct syscall_info *sy,
		int fd, char *addr);
#endif /* sgi */

#ifndef CRAY
struct status *sy_mmread(struct io_req *req, struct syscall_info *sysc,
			 int fd, char *addr);
struct status *sy_mmwrite(struct io_req *req, struct syscall_info *sysc,
			  int fd, char *addr);
struct status *sy_mmrw(struct io_req *req, struct syscall_info *sysc,
		       int fd, char *addr, int rw);
char *fmt_mmrw(struct io_req *req, struct syscall_info *sy, int fd, char *addr);
#endif /* !CRAY */

int do_rw(struct io_req *req);

#ifdef sgi
int do_fcntl(struct io_req *req);
#endif /* sgi */

#ifndef CRAY
int do_sync(struct io_req *req);
#endif /* !CRAY */

/* data pattern fill / check, logging, buffer and fd management */
int doio_pat_fill(char *addr, int mem_needed, char *Pattern,
		  int Pattern_Length, int shift);
char *doio_pat_check(char *buf, int offset, int length,
		     char *pattern, int pattern_length, int patshift);
char *check_file(char *file, int offset, int length, char *pattern,
		 int pattern_length, int patshift, int fsa);
int doio_fprintf(FILE * stream, char *format, ...);
int alloc_mem(int nbytes);

#if defined(_CRAY1) || defined(CRAY)
int alloc_sds(int nbytes);
#endif /* defined(_CRAY1) || defined(CRAY) */

int alloc_fd(char *file, int oflags);
struct fd_cache *alloc_fdcache(char *file, int oflags);

/* signal handlers: sgi uses the three-argument SA_SIGINFO form */
#ifdef sgi
void signal_info(int sig, siginfo_t * info, void *v);
void cleanup_handler(int sig, siginfo_t * info, void *v);
void die_handler(int sig, siginfo_t * info, void *v);
void sigbus_handler(int sig, siginfo_t * info, void *v);
#else /* !sgi */
void cleanup_handler(int sig);
void die_handler(int sig);

#ifndef CRAY
void sigbus_handler(int sig);
#endif /* !CRAY */
#endif /* sgi */

void noop_handler(int sig);
void sigint_handler(int sig);
void aio_handler(int sig);
void dump_aio(void);

#ifdef sgi
void cb_handler(sigval_t val);
#endif /* sgi */

/* async I/O slot bookkeeping */
struct aio_info *aio_slot(int aio_id);
int aio_register(int fd, int strategy, int sig);
int aio_unregister(int aio_id);

#ifndef __linux__
int aio_wait(int aio_id);
#endif /* !__linux__ */

char *hms(time_t t);
int aio_done(struct aio_info *ainfo);
void doio_upanic(int mask);
int parse_cmdline(int argc, char **argv, char *opts);

#ifndef CRAY
void parse_memalloc(char *arg);
void dump_memalloc(void);
#endif /* !CRAY */

void parse_delay(char *arg);
int usage(FILE * stream);
void help(FILE * stream);
463
464/*
465 * Upanic conditions, and a map from symbolics to values
466 */
467
468#define U_CORRUPTION	0001	/* upanic on data corruption    */
469#define U_IOSW	    	0002	/* upanic on bad iosw           */
470#define U_RVAL	    	0004	/* upanic on bad rval           */
471
472#define U_ALL	    	(U_CORRUPTION | U_IOSW | U_RVAL)
473
474struct smap Upanic_Args[] = {
475	{"corruption", U_CORRUPTION},
476	{"iosw", U_IOSW},
477	{"rval", U_RVAL},
478	{"all", U_ALL},
479	{NULL, 0}
480};
481
482struct aio_info Aio_Info[MAX_AIO];
483
484/* -C data-fill/check type */
485#define	C_DEFAULT	1
486struct smap checkmap[] = {
487	{"default", C_DEFAULT},
488	{NULL, 0},
489};
490
491/* -d option delay types */
492#define	DELAY_SELECT	1
493#define	DELAY_SLEEP	2
494#define	DELAY_SGINAP	3
495#define	DELAY_ALARM	4
496#define	DELAY_ITIMER	5	/* POSIX timer                          */
497
498struct smap delaymap[] = {
499	{"select", DELAY_SELECT},
500	{"sleep", DELAY_SLEEP},
501#ifdef sgi
502	{"sginap", DELAY_SGINAP},
503#endif
504	{"alarm", DELAY_ALARM},
505	{NULL, 0},
506};
507
508/******
509*
510* strerror() does similar actions.
511
512char *
513syserrno(int err)
514{
515    static char sys_errno[10];
516    sprintf(sys_errno, "%d", errno);
517    return(sys_errno);
518}
519
520******/
521
522int main(int argc, char **argv)
523{
524	int i, pid, stat, ex_stat;
525#ifdef CRAY
526	sigset_t omask;
527#elif defined(linux)
528	sigset_t omask, block_mask;
529#else
530	int omask;
531#endif
532	struct sigaction sa;
533
534	umask(0);		/* force new file modes to known values */
535#if _CRAYMPP
536	Npes = sysconf(_SC_CRAY_NPES);	/* must do this before parse_cmdline */
537	Vpe = sysconf(_SC_CRAY_VPE);
538#endif
539
540	TagName[0] = '\0';
541	parse_cmdline(argc, argv, OPTS);
542
543	random_range_seed(getpid());	/* initialize random number generator */
544
545	/*
546	 * If this is a re-exec of doio, jump directly into the doio function.
547	 */
548
549	if (Execd) {
550		doio();
551		exit(E_SETUP);
552	}
553
554	/*
555	 * Stop on all but a few signals...
556	 */
557	sigemptyset(&sa.sa_mask);
558	sa.sa_handler = sigint_handler;
559	sa.sa_flags = SA_RESETHAND;	/* sigint is ignored after the */
560	/* first time */
561	for (i = 1; i <= NSIG; i++) {
562		switch (i) {
563#ifdef SIGRECOVERY
564		case SIGRECOVERY:
565			break;
566#endif
567#ifdef SIGCKPT
568		case SIGCKPT:
569#endif
570#ifdef SIGRESTART
571		case SIGRESTART:
572#endif
573		case SIGTSTP:
574		case SIGSTOP:
575		case SIGCONT:
576		case SIGCLD:
577		case SIGBUS:
578		case SIGSEGV:
579		case SIGQUIT:
580			break;
581		default:
582			sigaction(i, &sa, NULL);
583		}
584	}
585
586	/*
587	 * If we're logging write operations, make a dummy call to wlog_open
588	 * to initialize the write history file.  This call must be done in
589	 * the parent, to ensure that the history file exists and/or has
590	 * been truncated before any children attempt to open it, as the doio
591	 * children are not allowed to truncate the file.
592	 */
593
594	if (w_opt) {
595		strcpy(Wlog.w_file, Write_Log);
596
597		if (wlog_open(&Wlog, 1, 0666) < 0) {
598			doio_fprintf(stderr,
599				     "Could not create/truncate write log %s\n",
600				     Write_Log);
601			exit(2);
602		}
603
604		wlog_close(&Wlog);
605	}
606
607	/*
608	 * Malloc space for the children pid array.  Initialize all entries
609	 * to -1.
610	 */
611
612	Children = malloc(sizeof(int) * Nprocs);
613	for (i = 0; i < Nprocs; i++) {
614		Children[i] = -1;
615	}
616
617	sigemptyset(&block_mask);
618	sigaddset(&block_mask, SIGCLD);
619	sigprocmask(SIG_BLOCK, &block_mask, &omask);
620
621	/*
622	 * Fork Nprocs.  This [parent] process is a watchdog, to notify the
623	 * invoker of procs which exit abnormally, and to make sure that all
624	 * child procs get cleaned up.  If the -e option was used, we will also
625	 * re-exec.  This is mostly for unicos/mk on mpp's, to ensure that not
626	 * all of the doio's don't end up in the same pe.
627	 *
628	 * Note - if Nprocs is 1, or this doio is a multi-pe app (Npes > 1),
629	 * jump directly to doio().  multi-pe apps can't fork(), and there is
630	 * no reason to fork() for 1 proc.
631	 */
632
633	if (Nprocs == 1 || Npes > 1) {
634		doio();
635		exit(0);
636	} else {
637		for (i = 0; i < Nprocs; i++) {
638			if ((pid = fork()) == -1) {
639				doio_fprintf(stderr,
640					     "(parent) Could not fork %d children:  %s (%d)\n",
641					     i + 1, SYSERR, errno);
642				exit(E_SETUP);
643			}
644
645			Children[Nchildren] = pid;
646			Nchildren++;
647
648			if (pid == 0) {
649				if (e_opt) {
650					char *exec_path;
651
652					exec_path = argv[0];
653					argv[0] = malloc(strlen(exec_path) + 2);
654					sprintf(argv[0], "-%s", exec_path);
655
656					execvp(exec_path, argv);
657					doio_fprintf(stderr,
658						     "(parent) Could not execvp %s:  %s (%d)\n",
659						     exec_path, SYSERR, errno);
660					exit(E_SETUP);
661				} else {
662					doio();
663					exit(E_SETUP);
664				}
665			}
666		}
667
668		/*
669		 * Parent spins on wait(), until all children exit.
670		 */
671
672		ex_stat = E_NORMAL;
673
674		while (Nprocs) {
675			if ((pid = wait(&stat)) == -1) {
676				if (errno == EINTR)
677					continue;
678			}
679
680			for (i = 0; i < Nchildren; i++)
681				if (Children[i] == pid)
682					Children[i] = -1;
683
684			Nprocs--;
685
686			if (WIFEXITED(stat)) {
687				switch (WEXITSTATUS(stat)) {
688				case E_NORMAL:
689					/* noop */
690					break;
691
692				case E_INTERNAL:
693					doio_fprintf(stderr,
694						     "(parent) pid %d exited because of an internal error\n",
695						     pid);
696					ex_stat |= E_INTERNAL;
697					break;
698
699				case E_SETUP:
700					doio_fprintf(stderr,
701						     "(parent) pid %d exited because of a setup error\n",
702						     pid);
703					ex_stat |= E_SETUP;
704					break;
705
706				case E_COMPARE:
707					doio_fprintf(stderr,
708						     "(parent) pid %d exited because of data compare errors\n",
709						     pid);
710
711					ex_stat |= E_COMPARE;
712
713					if (a_opt)
714						kill(0, SIGINT);
715
716					break;
717
718				case E_USAGE:
719					doio_fprintf(stderr,
720						     "(parent) pid %d exited because of a usage error\n",
721						     pid);
722
723					ex_stat |= E_USAGE;
724					break;
725
726				default:
727					doio_fprintf(stderr,
728						     "(parent) pid %d exited with unknown status %d\n",
729						     pid, WEXITSTATUS(stat));
730					ex_stat |= E_INTERNAL;
731					break;
732				}
733			} else if (WIFSIGNALED(stat)
734				   && WTERMSIG(stat) != SIGINT) {
735				doio_fprintf(stderr,
736					     "(parent) pid %d terminated by signal %d\n",
737					     pid, WTERMSIG(stat));
738
739				ex_stat |= E_SIGNAL;
740			}
741
742			fflush(NULL);
743		}
744	}
745
746	exit(ex_stat);
747
748}				/* main */
749
750/*
751 * main doio function.  Each doio child starts here, and never returns.
752 */
753
754void doio(void)
755{
756	int rval, i, infd, nbytes;
757	char *cp;
758	struct io_req ioreq;
759	struct sigaction sa, def_action, ignore_action, exit_action;
760#ifndef CRAY
761	struct sigaction sigbus_action;
762#endif
763
764	Memsize = Sdssize = 0;
765
766	/*
767	 * Initialize the Pattern - write-type syscalls will replace Pattern[1]
768	 * with the pattern passed in the request.  Make sure that
769	 * strlen(Pattern) is not mod 16 so that out of order words will be
770	 * detected.
771	 */
772
773	gethostname(Host, sizeof(Host));
774	if ((cp = strchr(Host, '.')) != NULL)
775		*cp = '\0';
776
777	Pattern_Length = sprintf(Pattern, "-:%d:%s:%s*", getpid(), Host, Prog);
778
779	if (!(Pattern_Length % 16)) {
780		Pattern_Length = sprintf(Pattern, "-:%d:%s:%s**",
781					 getpid(), Host, Prog);
782	}
783
784	/*
785	 * Open a couple of descriptors for the write-log file.  One descriptor
786	 * is for appending, one for random access.  Write logging is done for
787	 * file corruption detection.  The program doio_check is capable of
788	 * doing corruption detection based on a doio write-log.
789	 */
790
791	if (w_opt) {
792
793		strcpy(Wlog.w_file, Write_Log);
794
795		if (wlog_open(&Wlog, 0, 0666) == -1) {
796			doio_fprintf(stderr,
797				     "Could not open write log file (%s): wlog_open() failed\n",
798				     Write_Log);
799			exit(E_SETUP);
800		}
801	}
802
803	/*
804	 * Open the input stream - either a file or stdin
805	 */
806
807	if (Infile == NULL) {
808		infd = 0;
809	} else {
810		if ((infd = open(Infile, O_RDWR)) == -1) {
811			doio_fprintf(stderr,
812				     "Could not open input file (%s):  %s (%d)\n",
813				     Infile, SYSERR, errno);
814			exit(E_SETUP);
815		}
816	}
817
818	/*
819	 * Define a set of signals that should never be masked.  Receipt of
820	 * these signals generally indicates a programming error, and we want
821	 * a corefile at the point of error.  We put SIGQUIT in this list so
822	 * that ^\ will force a user core dump.
823	 *
824	 * Note:  the handler for these should be SIG_DFL, all of them
825	 * produce a corefile as the default action.
826	 */
827
828	ignore_action.sa_handler = SIG_IGN;
829	ignore_action.sa_flags = 0;
830	sigemptyset(&ignore_action.sa_mask);
831
832	def_action.sa_handler = SIG_DFL;
833	def_action.sa_flags = 0;
834	sigemptyset(&def_action.sa_mask);
835
836#ifdef sgi
837	exit_action.sa_sigaction = cleanup_handler;
838	exit_action.sa_flags = SA_SIGINFO;
839	sigemptyset(&exit_action.sa_mask);
840
841	sa.sa_sigaction = die_handler;
842	sa.sa_flags = SA_SIGINFO;
843	sigemptyset(&sa.sa_mask);
844
845	sigbus_action.sa_sigaction = sigbus_handler;
846	sigbus_action.sa_flags = SA_SIGINFO;
847	sigemptyset(&sigbus_action.sa_mask);
848#else
849	exit_action.sa_handler = cleanup_handler;
850	exit_action.sa_flags = 0;
851	sigemptyset(&exit_action.sa_mask);
852
853	sa.sa_handler = die_handler;
854	sa.sa_flags = 0;
855	sigemptyset(&sa.sa_mask);
856
857#ifndef CRAY
858	sigbus_action.sa_handler = sigbus_handler;
859	sigbus_action.sa_flags = 0;
860	sigemptyset(&sigbus_action.sa_mask);
861#endif
862#endif
863
864	for (i = 1; i <= NSIG; i++) {
865		switch (i) {
866			/* Signals to terminate program on */
867		case SIGINT:
868			sigaction(i, &exit_action, NULL);
869			break;
870
871#ifndef CRAY
872			/* This depends on active_mmap_rw */
873		case SIGBUS:
874			sigaction(i, &sigbus_action, NULL);
875			break;
876#endif
877
878			/* Signals to Ignore... */
879		case SIGSTOP:
880		case SIGCONT:
881#ifdef SIGRECOVERY
882		case SIGRECOVERY:
883#endif
884			sigaction(i, &ignore_action, NULL);
885			break;
886
887			/* Signals to trap & report & die */
888			/*case SIGTRAP: */
889			/*case SIGABRT: */
890#ifdef SIGERR			/* cray only signals */
891		case SIGERR:
892		case SIGBUFIO:
893		case SIGINFO:
894#endif
895			/*case SIGFPE: */
896		case SIGURG:
897		case SIGHUP:
898		case SIGTERM:
899		case SIGPIPE:
900		case SIGIO:
901		case SIGUSR1:
902		case SIGUSR2:
903			sigaction(i, &sa, NULL);
904			break;
905
906			/* Default Action for all other signals */
907		default:
908			sigaction(i, &def_action, NULL);
909			break;
910		}
911	}
912
913	/*
914	 * Main loop - each doio proc does this until the read returns eof (0).
915	 * Call the appropriate io function based on the request type.
916	 */
917
918	while ((nbytes = read(infd, (char *)&ioreq, sizeof(ioreq)))) {
919
920		/*
921		 * Periodically check our ppid.  If it is 1, the child exits to
922		 * help clean up in the case that the main doio process was
923		 * killed.
924		 */
925
926		if (Reqno && ((Reqno % PPID_CHECK_INTERVAL) == 0)) {
927			if (getppid() == 1) {
928				doio_fprintf(stderr,
929					     "Parent doio process has exited\n");
930				alloc_mem(-1);
931				exit(E_SETUP);
932			}
933		}
934
935		if (nbytes == -1) {
936			doio_fprintf(stderr,
937				     "read of %d bytes from input failed:  %s (%d)\n",
938				     sizeof(ioreq), SYSERR, errno);
939			alloc_mem(-1);
940			exit(E_SETUP);
941		}
942
943		if (nbytes != sizeof(ioreq)) {
944			doio_fprintf(stderr,
945				     "read wrong # bytes from input stream, expected %d, got %d\n",
946				     sizeof(ioreq), nbytes);
947			alloc_mem(-1);
948			exit(E_SETUP);
949		}
950
951		if (ioreq.r_magic != DOIO_MAGIC) {
952			doio_fprintf(stderr,
953				     "got a bad magic # from input stream.  Expected 0%o, got 0%o\n",
954				     DOIO_MAGIC, ioreq.r_magic);
955			alloc_mem(-1);
956			exit(E_SETUP);
957		}
958
959		/*
960		 * If we're on a Release_Interval multiple, relase all ssd and
961		 * core space, and close all fd's in Fd_Map[].
962		 */
963
964		if (Reqno && Release_Interval && !(Reqno % Release_Interval)) {
965			if (Memsize) {
966#ifdef NOTDEF
967				sbrk(-1 * Memsize);
968#else
969				alloc_mem(-1);
970#endif
971			}
972#ifdef _CRAY1
973			if (Sdssize) {
974				ssbreak(-1 * btoc(Sdssize));
975				Sdsptr = 0;
976				Sdssize = 0;
977			}
978#endif /* _CRAY1 */
979
980			alloc_fd(NULL, 0);
981		}
982
983		switch (ioreq.r_type) {
984		case READ:
985		case READA:
986			rval = do_read(&ioreq);
987			break;
988
989		case WRITE:
990		case WRITEA:
991			rval = do_write(&ioreq);
992			break;
993
994		case READV:
995		case AREAD:
996		case PREAD:
997		case LREAD:
998		case LREADA:
999		case LSREAD:
1000		case LSREADA:
1001		case WRITEV:
1002		case AWRITE:
1003		case PWRITE:
1004		case MMAPR:
1005		case MMAPW:
1006		case LWRITE:
1007		case LWRITEA:
1008		case LSWRITE:
1009		case LSWRITEA:
1010		case LEREAD:
1011		case LEREADA:
1012		case LEWRITE:
1013		case LEWRITEA:
1014			rval = do_rw(&ioreq);
1015			break;
1016
1017#ifdef CRAY
1018		case SSREAD:
1019		case SSWRITE:
1020			rval = do_ssdio(&ioreq);
1021			break;
1022
1023		case LISTIO:
1024			rval = do_listio(&ioreq);
1025			break;
1026#endif
1027
1028#ifdef sgi
1029		case RESVSP:
1030		case UNRESVSP:
1031#ifdef F_FSYNC
1032		case DFFSYNC:
1033#endif
1034			rval = do_fcntl(&ioreq);
1035			break;
1036#endif /* sgi */
1037
1038#ifndef CRAY
1039		case FSYNC2:
1040		case FDATASYNC:
1041			rval = do_sync(&ioreq);
1042			break;
1043#endif
1044		default:
1045			doio_fprintf(stderr,
1046				     "Don't know how to handle io request type %d\n",
1047				     ioreq.r_type);
1048			alloc_mem(-1);
1049			exit(E_SETUP);
1050		}
1051
1052		if (rval == SKIP_REQ) {
1053			Reqskipcnt++;
1054		} else if (rval != 0) {
1055			alloc_mem(-1);
1056			doio_fprintf(stderr,
1057				     "doio(): operation %d returned != 0\n",
1058				     ioreq.r_type);
1059			exit(E_SETUP);
1060		}
1061
1062		if (Message_Interval && Reqno % Message_Interval == 0) {
1063			doio_fprintf(stderr,
1064				     "Info:  %d requests done (%d skipped) by this process\n",
1065				     Reqno, Reqskipcnt);
1066		}
1067
1068		Reqno++;
1069
1070		if (delayop != 0)
1071			doio_delay();
1072	}
1073
1074	/*
1075	 * Child exits normally
1076	 */
1077	alloc_mem(-1);
1078	exit(E_NORMAL);
1079
1080}				/* doio */
1081
1082void doio_delay(void)
1083{
1084	struct timeval tv_delay;
1085	struct sigaction sa_al, sa_old;
1086	sigset_t al_mask;
1087
1088	switch (delayop) {
1089	case DELAY_SELECT:
1090		tv_delay.tv_sec = delaytime / 1000000;
1091		tv_delay.tv_usec = delaytime % 1000000;
1092		/*doio_fprintf(stdout, "delay_select: %d %d\n",
1093		   tv_delay.tv_sec, tv_delay.tv_usec); */
1094		select(0, NULL, NULL, NULL, &tv_delay);
1095		break;
1096
1097	case DELAY_SLEEP:
1098		sleep(delaytime);
1099		break;
1100
1101#ifdef sgi
1102	case DELAY_SGINAP:
1103		sginap(delaytime);
1104		break;
1105#endif
1106
1107	case DELAY_ALARM:
1108		sa_al.sa_flags = 0;
1109		sa_al.sa_handler = noop_handler;
1110		sigemptyset(&sa_al.sa_mask);
1111		sigaction(SIGALRM, &sa_al, &sa_old);
1112		sigemptyset(&al_mask);
1113		alarm(delaytime);
1114		sigsuspend(&al_mask);
1115		sigaction(SIGALRM, &sa_old, 0);
1116		break;
1117	}
1118}
1119
1120/*
1121 * Format IO requests, returning a pointer to the formatted text.
1122 *
1123 * format_strat	- formats the async i/o completion strategy
1124 * format_rw	- formats a read[a]/write[a] request
1125 * format_sds	- formats a ssread/sswrite request
1126 * format_listio- formats a listio request
1127 *
1128 * ioreq is the doio io request structure.
1129 */
1130
1131struct smap sysnames[] = {
1132	{"READ", READ},
1133	{"WRITE", WRITE},
1134	{"READA", READA},
1135	{"WRITEA", WRITEA},
1136	{"SSREAD", SSREAD},
1137	{"SSWRITE", SSWRITE},
1138	{"LISTIO", LISTIO},
1139	{"LREAD", LREAD},
1140	{"LREADA", LREADA},
1141	{"LWRITE", LWRITE},
1142	{"LWRITEA", LWRITEA},
1143	{"LSREAD", LSREAD},
1144	{"LSREADA", LSREADA},
1145	{"LSWRITE", LSWRITE},
1146	{"LSWRITEA", LSWRITEA},
1147
1148	/* Irix System Calls */
1149	{"PREAD", PREAD},
1150	{"PWRITE", PWRITE},
1151	{"AREAD", AREAD},
1152	{"AWRITE", AWRITE},
1153	{"LLREAD", LLREAD},
1154	{"LLAREAD", LLAREAD},
1155	{"LLWRITE", LLWRITE},
1156	{"LLAWRITE", LLAWRITE},
1157	{"RESVSP", RESVSP},
1158	{"UNRESVSP", UNRESVSP},
1159	{"DFFSYNC", DFFSYNC},
1160
1161	/* Irix and Linux System Calls */
1162	{"READV", READV},
1163	{"WRITEV", WRITEV},
1164	{"MMAPR", MMAPR},
1165	{"MMAPW", MMAPW},
1166	{"FSYNC2", FSYNC2},
1167	{"FDATASYNC", FDATASYNC},
1168
1169	{"unknown", -1},
1170};
1171
1172struct smap aionames[] = {
1173	{"poll", A_POLL},
1174	{"signal", A_SIGNAL},
1175	{"recall", A_RECALL},
1176	{"recalla", A_RECALLA},
1177	{"recalls", A_RECALLS},
1178	{"suspend", A_SUSPEND},
1179	{"callback", A_CALLBACK},
1180	{"synch", 0},
1181	{"unknown", -1},
1182};
1183
/*
 * Build a human readable, comma separated list of the open(2) flags set
 * in oflags, e.g. "O_RDWR,O_SYNC,".  Every name - including the last
 * one - is followed by a comma.
 *
 * Returns a strdup()ed string that the caller owns, or NULL if strdup()
 * could not allocate memory.
 */
char *format_oflags(int oflags)
{
	char flags[255];

	flags[0] = '\0';

	/* O_ACCMODE is the portable mask for the access-mode bits */
	switch (oflags & O_ACCMODE) {
	case O_RDONLY:
		strcat(flags, "O_RDONLY,");
		break;
	case O_WRONLY:
		strcat(flags, "O_WRONLY,");
		break;
	case O_RDWR:
		strcat(flags, "O_RDWR,");
		break;
	default:
		/*
		 * Terminate with a comma like every other name, otherwise
		 * the next flag name is glued on ("O_weirdO_EXCL,").
		 */
		strcat(flags, "O_weird,");
		break;
	}

	if (oflags & O_EXCL)
		strcat(flags, "O_EXCL,");

	if (oflags & O_SYNC)
		strcat(flags, "O_SYNC,");
#ifdef CRAY
	if (oflags & O_RAW)
		strcat(flags, "O_RAW,");
	if (oflags & O_WELLFORMED)
		strcat(flags, "O_WELLFORMED,");
#ifdef O_SSD
	if (oflags & O_SSD)
		strcat(flags, "O_SSD,");
#endif
	if (oflags & O_LDRAW)
		strcat(flags, "O_LDRAW,");
	if (oflags & O_PARALLEL)
		strcat(flags, "O_PARALLEL,");
	if (oflags & O_BIG)
		strcat(flags, "O_BIG,");
	if (oflags & O_PLACE)
		strcat(flags, "O_PLACE,");
	if (oflags & O_ASYNC)
		strcat(flags, "O_ASYNC,");
#endif

#ifdef sgi
	if (oflags & O_DIRECT)
		strcat(flags, "O_DIRECT,");
	if (oflags & O_DSYNC)
		strcat(flags, "O_DSYNC,");
	if (oflags & O_RSYNC)
		strcat(flags, "O_RSYNC,");
#endif

	/* hand back a heap copy; the local buffer dies with this frame */
	return (strdup(flags));
}
1241
1242char *format_strat(int strategy)
1243{
1244	char msg[64];
1245	char *aio_strat;
1246
1247	switch (strategy) {
1248	case A_POLL:
1249		aio_strat = "POLL";
1250		break;
1251	case A_SIGNAL:
1252		aio_strat = "SIGNAL";
1253		break;
1254	case A_RECALL:
1255		aio_strat = "RECALL";
1256		break;
1257	case A_RECALLA:
1258		aio_strat = "RECALLA";
1259		break;
1260	case A_RECALLS:
1261		aio_strat = "RECALLS";
1262		break;
1263	case A_SUSPEND:
1264		aio_strat = "SUSPEND";
1265		break;
1266	case A_CALLBACK:
1267		aio_strat = "CALLBACK";
1268		break;
1269	case 0:
1270		aio_strat = "<zero>";
1271		break;
1272	default:
1273		sprintf(msg, "<error:%#o>", strategy);
1274		aio_strat = strdup(msg);
1275		break;
1276	}
1277
1278	return (aio_strat);
1279}
1280
1281char *format_rw(struct io_req *ioreq, int fd, void *buffer, int signo,
1282		char *pattern, void *iosw)
1283{
1284	static char *errbuf = NULL;
1285	char *aio_strat, *cp;
1286	struct read_req *readp = &ioreq->r_data.read;
1287	struct write_req *writep = &ioreq->r_data.write;
1288	struct read_req *readap = &ioreq->r_data.read;
1289	struct write_req *writeap = &ioreq->r_data.write;
1290
1291	if (errbuf == NULL)
1292		errbuf = malloc(32768);
1293
1294	cp = errbuf;
1295	cp += sprintf(cp, "Request number %d\n", Reqno);
1296
1297	switch (ioreq->r_type) {
1298	case READ:
1299		cp += sprintf(cp, "syscall:  read(%d, %#lo, %d)\n",
1300			      fd, (unsigned long)buffer, readp->r_nbytes);
1301		cp +=
1302		    sprintf(cp,
1303			    "          fd %d is file %s - open flags are %#o\n",
1304			    fd, readp->r_file, readp->r_oflags);
1305		cp +=
1306		    sprintf(cp, "          read done at file offset %d\n",
1307			    readp->r_offset);
1308		break;
1309
1310	case WRITE:
1311		cp += sprintf(cp, "syscall:  write(%d, %#lo, %d)\n",
1312			      fd, (unsigned long)buffer, writep->r_nbytes);
1313		cp +=
1314		    sprintf(cp,
1315			    "          fd %d is file %s - open flags are %#o\n",
1316			    fd, writep->r_file, writep->r_oflags);
1317		cp +=
1318		    sprintf(cp,
1319			    "          write done at file offset %d - pattern is %s\n",
1320			    writep->r_offset, pattern);
1321		break;
1322
1323	case READA:
1324		aio_strat = format_strat(readap->r_aio_strat);
1325
1326		cp += sprintf(cp, "syscall:  reada(%d, %#lo, %d, %#lo, %d)\n",
1327			      fd, (unsigned long)buffer, readap->r_nbytes,
1328			      (unsigned long)iosw, signo);
1329		cp +=
1330		    sprintf(cp,
1331			    "          fd %d is file %s - open flags are %#o\n",
1332			    fd, readap->r_file, readp->r_oflags);
1333		cp +=
1334		    sprintf(cp, "          reada done at file offset %d\n",
1335			    readap->r_offset);
1336		cp +=
1337		    sprintf(cp,
1338			    "          async io completion strategy is %s\n",
1339			    aio_strat);
1340		break;
1341
1342	case WRITEA:
1343		aio_strat = format_strat(writeap->r_aio_strat);
1344
1345		cp += sprintf(cp, "syscall:  writea(%d, %#lo, %d, %#lo, %d)\n",
1346			      fd, (unsigned long)buffer, writeap->r_nbytes,
1347			      (unsigned long)iosw, signo);
1348		cp +=
1349		    sprintf(cp,
1350			    "          fd %d is file %s - open flags are %#o\n",
1351			    fd, writeap->r_file, writeap->r_oflags);
1352		cp +=
1353		    sprintf(cp,
1354			    "          writea done at file offset %d - pattern is %s\n",
1355			    writeap->r_offset, pattern);
1356		cp +=
1357		    sprintf(cp,
1358			    "          async io completion strategy is %s\n",
1359			    aio_strat);
1360		break;
1361
1362	}
1363
1364	return errbuf;
1365}
1366
1367#ifdef CRAY
1368char *format_sds(struct io_req *ioreq, void *buffer, int sds, char *pattern)
1369{
1370	int i;
1371	static char *errbuf = NULL;
1372	char *cp;
1373
1374	struct ssread_req *ssreadp = &ioreq->r_data.ssread;
1375	struct sswrite_req *sswritep = &ioreq->r_data.sswrite;
1376
1377	if (errbuf == NULL)
1378		errbuf = malloc(32768);
1379
1380	cp = errbuf;
1381	cp += sprintf(cp, "Request number %d\n", Reqno);
1382
1383	switch (ioreq->r_type) {
1384	case SSREAD:
1385		cp += sprintf(cp, "syscall:  ssread(%#o, %#o, %d)\n",
1386			      buffer, sds, ssreadp->r_nbytes);
1387		break;
1388
1389	case SSWRITE:
1390		cp +=
1391		    sprintf(cp,
1392			    "syscall:  sswrite(%#o, %#o, %d) - pattern was %s\n",
1393			    buffer, sds, sswritep->r_nbytes, pattern);
1394		break;
1395	}
1396	return errbuf;
1397}
1398#endif /* CRAY */
1399
1400/*
1401 * Perform the various sorts of disk reads
1402 */
1403
1404int do_read(struct io_req *req)
1405{
1406	int fd, offset, nbytes, oflags, rval;
1407	char *addr, *file;
1408#ifdef CRAY
1409	struct aio_info *aiop;
1410	int aio_id, aio_strat, signo;
1411#endif
1412#ifdef sgi
1413	struct fd_cache *fdc;
1414#endif
1415
1416	/*
1417	 * Initialize common fields - assumes r_oflags, r_file, r_offset, and
1418	 * r_nbytes are at the same offset in the read_req and reada_req
1419	 * structures.
1420	 */
1421
1422	file = req->r_data.read.r_file;
1423	oflags = req->r_data.read.r_oflags;
1424	offset = req->r_data.read.r_offset;
1425	nbytes = req->r_data.read.r_nbytes;
1426
1427	/*printf("read: %s, %#o, %d %d\n", file, oflags, offset, nbytes); */
1428
1429	/*
1430	 * Grab an open file descriptor
1431	 * Note: must be done before memory allocation so that the direct i/o
1432	 *      information is available in mem. allocate
1433	 */
1434
1435	if ((fd = alloc_fd(file, oflags)) == -1)
1436		return -1;
1437
1438	/*
1439	 * Allocate core or sds - based on the O_SSD flag
1440	 */
1441
1442#ifndef wtob
1443#define wtob(x)	(x * sizeof(UINT64_T))
1444#endif
1445
1446#ifdef CRAY
1447	if (oflags & O_SSD) {
1448		if (alloc_sds(nbytes) == -1)
1449			return -1;
1450
1451		addr = (char *)Sdsptr;
1452	} else {
1453		if ((rval =
1454		     alloc_mem(nbytes + wtob(1) * 2 +
1455			       MPP_BUMP * sizeof(UINT64_T))) < 0) {
1456			return rval;
1457		}
1458
1459		addr = Memptr;
1460
1461		/*
1462		 * if io is not raw, bump the offset by a random amount
1463		 * to generate non-word-aligned io.
1464		 */
1465		if (!(req->r_data.read.r_uflags & F_WORD_ALIGNED)) {
1466			addr += random_range(0, wtob(1) - 1, 1, NULL);
1467		}
1468	}
1469#else
1470#ifdef sgi
1471	/* get memory alignment for using DIRECT I/O */
1472	fdc = alloc_fdcache(file, oflags);
1473
1474	if ((rval = alloc_mem(nbytes + wtob(1) * 2 + fdc->c_memalign)) < 0) {
1475		return rval;
1476	}
1477
1478	addr = Memptr;
1479
1480	if ((req->r_data.read.r_uflags & F_WORD_ALIGNED)) {
1481		/*
1482		 * Force memory alignment for Direct I/O
1483		 */
1484		if ((oflags & O_DIRECT) && ((long)addr % fdc->c_memalign != 0)) {
1485			addr +=
1486			    fdc->c_memalign - ((long)addr % fdc->c_memalign);
1487		}
1488	} else {
1489		addr += random_range(0, wtob(1) - 1, 1, NULL);
1490	}
1491#else
1492	/* what is !CRAY && !sgi ? */
1493	if ((rval = alloc_mem(nbytes + wtob(1) * 2)) < 0) {
1494		return rval;
1495	}
1496
1497	addr = Memptr;
1498#endif /* !CRAY && sgi */
1499#endif /* CRAY */
1500
1501	switch (req->r_type) {
1502	case READ:
1503		/* move to the desired file position. */
1504		if (lseek(fd, offset, SEEK_SET) == -1) {
1505			doio_fprintf(stderr,
1506				     "lseek(%d, %d, SEEK_SET) failed:  %s (%d)\n",
1507				     fd, offset, SYSERR, errno);
1508			return -1;
1509		}
1510
1511		if ((rval = read(fd, addr, nbytes)) == -1) {
1512			doio_fprintf(stderr,
1513				     "read() request failed:  %s (%d)\n%s\n",
1514				     SYSERR, errno,
1515				     format_rw(req, fd, addr, -1, NULL, NULL));
1516			doio_upanic(U_RVAL);
1517			return -1;
1518		} else if (rval != nbytes) {
1519			doio_fprintf(stderr,
1520				     "read() request returned wrong # of bytes - expected %d, got %d\n%s\n",
1521				     nbytes, rval,
1522				     format_rw(req, fd, addr, -1, NULL, NULL));
1523			doio_upanic(U_RVAL);
1524			return -1;
1525		}
1526		break;
1527
1528#ifdef CRAY
1529	case READA:
1530		/*
1531		 * Async read
1532		 */
1533
1534		/* move to the desired file position. */
1535		if (lseek(fd, offset, SEEK_SET) == -1) {
1536			doio_fprintf(stderr,
1537				     "lseek(%d, %d, SEEK_SET) failed:  %s (%d)\n",
1538				     fd, offset, SYSERR, errno);
1539			return -1;
1540		}
1541
1542		aio_strat = req->r_data.read.r_aio_strat;
1543		signo = (aio_strat == A_SIGNAL) ? SIGUSR1 : 0;
1544
1545		aio_id = aio_register(fd, aio_strat, signo);
1546		aiop = aio_slot(aio_id);
1547
1548		if (reada(fd, addr, nbytes, &aiop->iosw, signo) == -1) {
1549			doio_fprintf(stderr, "reada() failed: %s (%d)\n%s\n",
1550				     SYSERR, errno,
1551				     format_rw(req, fd, addr, signo, NULL,
1552					       &aiop->iosw));
1553			aio_unregister(aio_id);
1554			doio_upanic(U_RVAL);
1555			rval = -1;
1556		} else {
1557			/*
1558			 * Wait for io to complete
1559			 */
1560
1561			aio_wait(aio_id);
1562
1563			/*
1564			 * make sure the io completed without error
1565			 */
1566
1567			if (aiop->iosw.sw_count != nbytes) {
1568				doio_fprintf(stderr,
1569					     "Bad iosw from reada()\nExpected (%d,%d,%d), got (%d,%d,%d)\n%s\n",
1570					     1, 0, nbytes,
1571					     aiop->iosw.sw_flag,
1572					     aiop->iosw.sw_error,
1573					     aiop->iosw.sw_count,
1574					     format_rw(req, fd, addr, signo,
1575						       NULL, &aiop->iosw));
1576				aio_unregister(aio_id);
1577				doio_upanic(U_IOSW);
1578				rval = -1;
1579			} else {
1580				aio_unregister(aio_id);
1581				rval = 0;
1582			}
1583		}
1584
1585		if (rval == -1)
1586			return rval;
1587		break;
1588#endif /* CRAY */
1589	}
1590
1591	return 0;		/* if we get here, everything went ok */
1592}
1593
1594/*
 * Perform the various types of disk writes.
1596 */
1597
1598int do_write(struct io_req *req)
1599{
1600	static int pid = -1;
1601	int fd, nbytes, oflags, signo;
1602	int logged_write, rval, got_lock;
1603	off_t offset, woffset;
1604	char *addr, pattern, *file, *msg;
1605	struct wlog_rec wrec;
1606#ifdef CRAY
1607	int aio_strat, aio_id;
1608	struct aio_info *aiop;
1609#endif
1610#ifdef sgi
1611	struct fd_cache *fdc;
1612#endif
1613
1614	woffset = 0;
1615
1616	/*
1617	 * Misc variable setup
1618	 */
1619
1620	signo = 0;
1621	nbytes = req->r_data.write.r_nbytes;
1622	offset = req->r_data.write.r_offset;
1623	pattern = req->r_data.write.r_pattern;
1624	file = req->r_data.write.r_file;
1625	oflags = req->r_data.write.r_oflags;
1626
1627	/*printf("pwrite: %s, %#o, %d %d\n", file, oflags, offset, nbytes); */
1628
1629	/*
1630	 * Allocate core memory and possibly sds space.  Initialize the data
1631	 * to be written.
1632	 */
1633
1634	Pattern[0] = pattern;
1635
1636	/*
1637	 * Get a descriptor to do the io on
1638	 */
1639
1640	if ((fd = alloc_fd(file, oflags)) == -1)
1641		return -1;
1642
1643	/*printf("write: %d, %s, %#o, %d %d\n",
1644	   fd, file, oflags, offset, nbytes); */
1645
1646	/*
1647	 * Allocate SDS space for backdoor write if desired
1648	 */
1649
1650#ifdef CRAY
1651	if (oflags & O_SSD) {
1652#ifndef _CRAYMPP
1653		if ((rval = alloc_mem(nbytes + wtob(1))) < 0) {
1654			return rval;
1655		}
1656
1657		(*Data_Fill) (Memptr, nbytes, Pattern, Pattern_Length, 0);
1658		/*pattern_fill(Memptr, nbytes, Pattern, Pattern_Length, 0); */
1659
1660		if (alloc_sds(nbytes) == -1)
1661			return -1;
1662
1663		if (sswrite((long)Memptr, Sdsptr, btoc(nbytes)) == -1) {
1664			doio_fprintf(stderr,
1665				     "sswrite(%d, %d, %d) failed:  %s (%d)\n",
1666				     (long)Memptr, Sdsptr, btoc(nbytes), SYSERR,
1667				     errno);
1668			fflush(stderr);
1669			return -1;
1670		}
1671
1672		addr = (char *)Sdsptr;
1673#else
1674		doio_fprintf(stderr,
1675			     "Invalid O_SSD flag was generated for MPP system\n");
1676		fflush(stderr);
1677		return -1;
1678#endif /* !CRAYMPP */
1679	} else {
1680		if ((rval = alloc_mem(nbytes + wtob(1)) < 0)) {
1681			return rval;
1682		}
1683
1684		addr = Memptr;
1685
1686		/*
1687		 * if io is not raw, bump the offset by a random amount
1688		 * to generate non-word-aligned io.
1689		 */
1690
1691		if (!(req->r_data.write.r_uflags & F_WORD_ALIGNED)) {
1692			addr += random_range(0, wtob(1) - 1, 1, NULL);
1693		}
1694
1695		(*Data_Fill) (Memptr, nbytes, Pattern, Pattern_Length, 0);
1696		if (addr != Memptr)
1697			memmove(addr, Memptr, nbytes);
1698	}
1699#else /* CRAY */
1700#ifdef sgi
1701	/* get memory alignment for using DIRECT I/O */
1702	fdc = alloc_fdcache(file, oflags);
1703
1704	if ((rval = alloc_mem(nbytes + wtob(1) * 2 + fdc->c_memalign)) < 0) {
1705		return rval;
1706	}
1707
1708	addr = Memptr;
1709
1710	if ((req->r_data.write.r_uflags & F_WORD_ALIGNED)) {
1711		/*
1712		 * Force memory alignment for Direct I/O
1713		 */
1714		if ((oflags & O_DIRECT) && ((long)addr % fdc->c_memalign != 0)) {
1715			addr +=
1716			    fdc->c_memalign - ((long)addr % fdc->c_memalign);
1717		}
1718	} else {
1719		addr += random_range(0, wtob(1) - 1, 1, NULL);
1720	}
1721
1722	(*Data_Fill) (Memptr, nbytes, Pattern, Pattern_Length, 0);
1723	if (addr != Memptr)
1724		memmove(addr, Memptr, nbytes);
1725
1726#else /* sgi */
1727	if ((rval = alloc_mem(nbytes + wtob(1) * 2)) < 0) {
1728		return rval;
1729	}
1730
1731	addr = Memptr;
1732
1733	(*Data_Fill) (Memptr, nbytes, Pattern, Pattern_Length, 0);
1734	if (addr != Memptr)
1735		memmove(addr, Memptr, nbytes);
1736#endif /* sgi */
1737#endif /* CRAY */
1738
1739	rval = -1;
1740	got_lock = 0;
1741	logged_write = 0;
1742
1743	if (k_opt) {
1744		if (lock_file_region(file, fd, F_WRLCK, offset, nbytes) < 0) {
1745			alloc_mem(-1);
1746			exit(E_INTERNAL);
1747		}
1748
1749		got_lock = 1;
1750	}
1751
1752	/*
1753	 * Write a preliminary write-log entry.  This is done so that
1754	 * doio_check can do corruption detection across an interrupt/crash.
1755	 * Note that w_done is set to 0.  If doio_check sees this, it
1756	 * re-creates the file extents as if the write completed, but does not
1757	 * do any checking - see comments in doio_check for more details.
1758	 */
1759
1760	if (w_opt) {
1761		if (pid == -1) {
1762			pid = getpid();
1763		}
1764		wrec.w_async = (req->r_type == WRITEA) ? 1 : 0;
1765		wrec.w_oflags = oflags;
1766		wrec.w_pid = pid;
1767		wrec.w_offset = offset;
1768		wrec.w_nbytes = nbytes;
1769
1770		wrec.w_pathlen = strlen(file);
1771		memcpy(wrec.w_path, file, wrec.w_pathlen);
1772		wrec.w_hostlen = strlen(Host);
1773		memcpy(wrec.w_host, Host, wrec.w_hostlen);
1774		wrec.w_patternlen = Pattern_Length;
1775		memcpy(wrec.w_pattern, Pattern, wrec.w_patternlen);
1776
1777		wrec.w_done = 0;
1778
1779		if ((woffset = wlog_record_write(&Wlog, &wrec, -1)) == -1) {
1780			doio_fprintf(stderr,
1781				     "Could not append to write-log:  %s (%d)\n",
1782				     SYSERR, errno);
1783		} else {
1784			logged_write = 1;
1785		}
1786	}
1787
1788	switch (req->r_type) {
1789	case WRITE:
1790		/*
1791		 * sync write
1792		 */
1793
1794		if (lseek(fd, offset, SEEK_SET) == -1) {
1795			doio_fprintf(stderr,
1796				     "lseek(%d, %d, SEEK_SET) failed:  %s (%d)\n",
1797				     fd, offset, SYSERR, errno);
1798			return -1;
1799		}
1800
1801		rval = write(fd, addr, nbytes);
1802
1803		if (rval == -1) {
1804			doio_fprintf(stderr,
1805				     "write() failed:  %s (%d)\n%s\n",
1806				     SYSERR, errno,
1807				     format_rw(req, fd, addr, -1, Pattern,
1808					       NULL));
1809#ifdef sgi
1810			doio_fprintf(stderr,
1811				     "write() failed:  %s\n\twrite(%d, %#o, %d)\n\toffset %d, nbytes%%miniou(%d)=%d, oflags=%#o memalign=%d, addr%%memalign=%d\n",
1812				     strerror(errno),
1813				     fd, addr, nbytes,
1814				     offset,
1815				     fdc->c_miniosz, nbytes % fdc->c_miniosz,
1816				     oflags, fdc->c_memalign,
1817				     (long)addr % fdc->c_memalign);
1818#else
1819			doio_fprintf(stderr,
1820				     "write() failed:  %s\n\twrite(%d, %#o, %d)\n\toffset %d, nbytes%%1B=%d, oflags=%#o\n",
1821				     strerror(errno),
1822				     fd, addr, nbytes,
1823				     offset, nbytes % 4096, oflags);
1824#endif
1825			doio_upanic(U_RVAL);
1826		} else if (rval != nbytes) {
1827			doio_fprintf(stderr,
1828				     "write() returned wrong # bytes - expected %d, got %d\n%s\n",
1829				     nbytes, rval,
1830				     format_rw(req, fd, addr, -1, Pattern,
1831					       NULL));
1832			doio_upanic(U_RVAL);
1833			rval = -1;
1834		}
1835
1836		break;
1837
1838#ifdef CRAY
1839	case WRITEA:
1840		/*
1841		 * async write
1842		 */
1843		if (lseek(fd, offset, SEEK_SET) == -1) {
1844			doio_fprintf(stderr,
1845				     "lseek(%d, %d, SEEK_SET) failed:  %s (%d)\n",
1846				     fd, offset, SYSERR, errno);
1847			return -1;
1848		}
1849
1850		aio_strat = req->r_data.write.r_aio_strat;
1851		signo = (aio_strat == A_SIGNAL) ? SIGUSR1 : 0;
1852
1853		aio_id = aio_register(fd, aio_strat, signo);
1854		aiop = aio_slot(aio_id);
1855
1856		/*
1857		 * init iosw and do the async write
1858		 */
1859
1860		if (writea(fd, addr, nbytes, &aiop->iosw, signo) == -1) {
1861			doio_fprintf(stderr,
1862				     "writea() failed: %s (%d)\n%s\n",
1863				     SYSERR, errno,
1864				     format_rw(req, fd, addr, -1, Pattern,
1865					       NULL));
1866			doio_upanic(U_RVAL);
1867			aio_unregister(aio_id);
1868			rval = -1;
1869		} else {
1870
1871			/*
1872			 * Wait for io to complete
1873			 */
1874
1875			aio_wait(aio_id);
1876
1877			/*
1878			 * check that iosw is ok
1879			 */
1880
1881			if (aiop->iosw.sw_count != nbytes) {
1882				doio_fprintf(stderr,
1883					     "Bad iosw from writea()\nExpected (%d,%d,%d), got (%d,%d,%d)\n%s\n",
1884					     1, 0, nbytes,
1885					     aiop->iosw.sw_flag,
1886					     aiop->iosw.sw_error,
1887					     aiop->iosw.sw_count,
1888					     format_rw(req, fd, addr, -1,
1889						       Pattern, &aiop->iosw));
1890				aio_unregister(aio_id);
1891				doio_upanic(U_IOSW);
1892				rval = -1;
1893			} else {
1894				aio_unregister(aio_id);
1895				rval = 0;
1896			}
1897		}
1898		break;
1899
1900#endif /* CRAY */
1901	}
1902
1903	/*
1904	 * Verify that the data was written correctly - check_file() returns
1905	 * a non-null pointer which contains an error message if there are
1906	 * problems.
1907	 */
1908
1909	if (v_opt) {
1910		msg = check_file(file, offset, nbytes, Pattern, Pattern_Length,
1911				 0, oflags & O_PARALLEL);
1912		if (msg != NULL) {
1913			doio_fprintf(stderr, "%s%s\n", msg,
1914#ifdef CRAY
1915				     format_rw(req, fd, addr, -1, Pattern,
1916					       &aiop->iosw)
1917#else
1918				     format_rw(req, fd, addr, -1, Pattern, NULL)
1919#endif
1920			    );
1921			doio_upanic(U_CORRUPTION);
1922			exit(E_COMPARE);
1923
1924		}
1925	}
1926
1927	/*
1928	 * General cleanup ...
1929	 *
1930	 * Write extent information to the write-log, so that doio_check can do
1931	 * corruption detection.  Note that w_done is set to 1, indicating that
1932	 * the write has been verified as complete.  We don't need to write the
1933	 * filename on the second logging.
1934	 */
1935
1936	if (w_opt && logged_write) {
1937		wrec.w_done = 1;
1938		wlog_record_write(&Wlog, &wrec, woffset);
1939	}
1940
1941	/*
1942	 * Unlock file region if necessary
1943	 */
1944
1945	if (got_lock) {
1946		if (lock_file_region(file, fd, F_UNLCK, offset, nbytes) < 0) {
1947			alloc_mem(-1);
1948			exit(E_INTERNAL);
1949		}
1950	}
1951
1952	return ((rval == -1) ? -1 : 0);
1953}
1954
1955/*
1956 * Simple routine to lock/unlock a file using fcntl()
1957 */
1958
1959int lock_file_region(char *fname, int fd, int type, int start, int nbytes)
1960{
1961	struct flock flk;
1962
1963	flk.l_type = type;
1964	flk.l_whence = 0;
1965	flk.l_start = start;
1966	flk.l_len = nbytes;
1967
1968	if (fcntl(fd, F_SETLKW, &flk) < 0) {
1969		doio_fprintf(stderr,
1970			     "fcntl(%d, %d, %#o) failed for file %s, lock type %d, offset %d, length %d:  %s (%d), open flags: %#o\n",
1971			     fd, F_SETLKW, &flk, fname, type,
1972			     start, nbytes, SYSERR, errno,
1973			     fcntl(fd, F_GETFL, 0));
1974		return -1;
1975	}
1976
1977	return 0;
1978}
1979
1980/*
1981 * Perform a listio request.
1982 */
1983
1984#ifdef CRAY
1985char *format_listio(struct io_req *ioreq, int lcmd, struct listreq *list,
1986		    int nent, int fd, char *pattern)
1987{
1988	static char *errbuf = NULL;
1989	struct listio_req *liop = &ioreq->r_data.listio;
1990	struct listreq *listreq;
1991	char *cp, *cmd, *opcode, *aio_strat;
1992	int i;
1993
1994	switch (lcmd) {
1995	case LC_START:
1996		cmd = "LC_START";
1997		break;
1998	case LC_WAIT:
1999		cmd = "LC_WAIT";
2000		break;
2001	default:
2002		cmd = "???";
2003		break;
2004	}
2005
2006	if (errbuf == NULL)
2007		errbuf = malloc(32768);
2008
2009	cp = errbuf;
2010	cp += sprintf(cp, "Request number %d\n", Reqno);
2011
2012	cp += sprintf(cp, "syscall:  listio(%s, %#o, %d)\n\n", cmd, list, nent);
2013
2014	aio_strat = format_strat(liop->r_aio_strat);
2015
2016	for (i = 0; i < nent; i++) {
2017		cp += sprintf(cp, "struct lioreq for request element %d\n", i);
2018		cp += sprintf(cp, "----------------------------------------\n");
2019
2020		listreq = list + i;
2021
2022		switch (listreq->li_opcode) {
2023		case LO_READ:
2024			opcode = "LO_READ";
2025			break;
2026		case LO_WRITE:
2027			opcode = "LO_WRITE";
2028			break;
2029		default:
2030			opcode = "???";
2031			break;
2032		}
2033
2034		cp += sprintf(cp, "          li_opcode =    %s\n", opcode);
2035		cp +=
2036		    sprintf(cp, "          li_drvr =      %#o\n",
2037			    listreq->li_drvr);
2038		cp +=
2039		    sprintf(cp, "          li_flags =     %#o\n",
2040			    listreq->li_flags);
2041		cp +=
2042		    sprintf(cp, "          li_offset =    %d\n",
2043			    listreq->li_offset);
2044		cp +=
2045		    sprintf(cp, "          li_fildes =    %d\n",
2046			    listreq->li_fildes);
2047		cp +=
2048		    sprintf(cp, "          li_buf =       %#o\n",
2049			    listreq->li_buf);
2050		cp +=
2051		    sprintf(cp, "          li_nbyte =     %d\n",
2052			    listreq->li_nbyte);
2053		cp +=
2054		    sprintf(cp, "          li_status =    %#o (%d, %d, %d)\n",
2055			    listreq->li_status, listreq->li_status->sw_flag,
2056			    listreq->li_status->sw_error,
2057			    listreq->li_status->sw_count);
2058		cp +=
2059		    sprintf(cp, "          li_signo =     %d\n",
2060			    listreq->li_signo);
2061		cp +=
2062		    sprintf(cp, "          li_nstride =   %d\n",
2063			    listreq->li_nstride);
2064		cp +=
2065		    sprintf(cp, "          li_filstride = %d\n",
2066			    listreq->li_filstride);
2067		cp +=
2068		    sprintf(cp, "          li_memstride = %d\n",
2069			    listreq->li_memstride);
2070		cp +=
2071		    sprintf(cp, "          io completion strategy is %s\n",
2072			    aio_strat);
2073	}
2074	return errbuf;
2075}
2076#endif /* CRAY */
2077
/*
 * Perform the listio request described by req (CRAY builds only; on
 * every other build the function simply returns -1).
 *
 * A single listreq is built from the request, issued with listio() and
 * waited for.  For LO_WRITE requests the affected file region is locked
 * when k_opt is set, and every stride is verified with check_file() when
 * v_opt is set.
 *
 * Returns 0 on success, -1 on a bogus request, descriptor, lock or i/o
 * failure, or the negative value returned by alloc_mem().  Exits with
 * E_COMPARE if verification detects corruption.
 */
int do_listio(struct io_req *req)
{
#ifdef CRAY
	struct listio_req *lio;
	int fd, signo, i;
	int rval, got_lock;
	int aio_strat, aio_id;
	int min_byte, max_byte;
	int mem_needed;
	int foffset, fstride, mstride, nstrides;
	char *moffset;
	char *addr, *msg;
	sigset_t block_mask, omask;
	struct aio_info *aiop;
	struct listreq lio_req;

	lio = &req->r_data.listio;

	/*
	 * If bytes per stride is less than the stride size, drop the request
	 * since it will cause overlapping strides, and we cannot predict
	 * the order they will complete in.
	 */

	if (lio->r_filestride && abs(lio->r_filestride) < lio->r_nbytes) {
		doio_fprintf(stderr,
			     "do_listio():  Bogus listio request - abs(filestride) [%d] < nbytes [%d]\n",
			     abs(lio->r_filestride), lio->r_nbytes);
		return -1;
	}

	/*
	 * Allocate core memory.  Initialize the data to be written.  Make
	 * sure we get enough, based on the memstride.
	 */

	mem_needed =
	    stride_bounds(0, lio->r_memstride, lio->r_nstrides,
			  lio->r_nbytes, NULL, NULL);

	if ((rval = alloc_mem(mem_needed + wtob(1))) < 0) {
		return rval;
	}

	/*
	 * Set the memory address pointer.  If the io is not raw, adjust
	 * addr by a random amount, so that non-raw io is not necessarily
	 * word aligned.
	 */

	addr = Memptr;

	if (!(lio->r_uflags & F_WORD_ALIGNED)) {
		addr += random_range(0, wtob(1) - 1, 1, NULL);
	}

	if (lio->r_opcode == LO_WRITE) {
		Pattern[0] = lio->r_pattern;
		(*Data_Fill) (Memptr, mem_needed, Pattern, Pattern_Length, 0);
		if (addr != Memptr)
			memmove(addr, Memptr, mem_needed);
	}

	/*
	 * Get a descriptor to do the io on.  No need to do an lseek, as this
	 * is encoded in the listio request.
	 */

	if ((fd = alloc_fd(lio->r_file, lio->r_oflags)) == -1) {
		return -1;
	}

	rval = -1;
	got_lock = 0;

	/*
	 * If the opcode is LO_WRITE, lock all regions of the file that
	 * are touched by this listio request.  Currently, we use
	 * stride_bounds() to figure out the min and max bytes affected, and
	 * lock the entire region, regardless of the file stride.
	 */

	if (lio->r_opcode == LO_WRITE && k_opt) {
		stride_bounds(lio->r_offset,
			      lio->r_filestride, lio->r_nstrides,
			      lio->r_nbytes, &min_byte, &max_byte);

		if (lock_file_region(lio->r_file, fd, F_WRLCK,
				     min_byte, (max_byte - min_byte + 1)) < 0) {
			doio_fprintf(stderr,
				     "stride_bounds(%d, %d, %d, %d, ..., ...) set min_byte to %d, max_byte to %d\n",
				     lio->r_offset, lio->r_filestride,
				     lio->r_nstrides, lio->r_nbytes, min_byte,
				     max_byte);
			return -1;
		} else {
			got_lock = 1;
		}
	}

	/*
	 * Register the async request
	 */

	aio_strat = lio->r_aio_strat;
	signo = (aio_strat == A_SIGNAL) ? SIGUSR1 : 0;

	aio_id = aio_register(fd, aio_strat, signo);
	aiop = aio_slot(aio_id);

	/*
	 * Form the listio request, and make the call.
	 */

	lio_req.li_opcode = lio->r_opcode;
	lio_req.li_drvr = 0;
	lio_req.li_flags = LF_LSEEK;
	lio_req.li_offset = lio->r_offset;
	lio_req.li_fildes = fd;

	/* with a negative memstride the first stride is the last in memory */
	if (lio->r_memstride >= 0 || lio->r_nstrides <= 1) {
		lio_req.li_buf = addr;
	} else {
		lio_req.li_buf = addr + mem_needed - lio->r_nbytes;
	}

	lio_req.li_nbyte = lio->r_nbytes;
	lio_req.li_status = &aiop->iosw;
	lio_req.li_signo = signo;
	lio_req.li_nstride = lio->r_nstrides;
	lio_req.li_filstride = lio->r_filestride;
	lio_req.li_memstride = lio->r_memstride;

	/*
	 * If signo != 0, block signo while we're in the system call, so that
	 * we don't get interrupted syscall failures.
	 */

	if (signo) {
		sigemptyset(&block_mask);
		sigaddset(&block_mask, signo);
		sigprocmask(SIG_BLOCK, &block_mask, &omask);
	}

	if (listio(lio->r_cmd, &lio_req, 1) < 0) {
		doio_fprintf(stderr,
			     "listio() failed: %s (%d)\n%s\n",
			     SYSERR, errno,
			     format_listio(req, lio->r_cmd, &lio_req, 1, fd,
					   Pattern));
		aio_unregister(aio_id);
		doio_upanic(U_RVAL);
		/*
		 * Restore the signal mask on this path too; otherwise signo
		 * stays blocked for the rest of the run.
		 */
		if (signo) {
			sigprocmask(SIG_SETMASK, &omask, NULL);
		}
		goto lio_done;
	}

	if (signo) {
		sigprocmask(SIG_SETMASK, &omask, NULL);
	}

	/*
	 * Wait for io to complete
	 */

	aio_wait(aio_id);

	/* an r_nstrides of 0 is treated as a single stride */
	nstrides = lio->r_nstrides ? lio->r_nstrides : 1;
	if (aiop->iosw.sw_count != lio->r_nbytes * nstrides) {
		/* report the same expected count that was just compared */
		doio_fprintf(stderr,
			     "Bad iosw from listio()\nExpected (%d,%d,%d), got (%d,%d,%d)\n%s\n",
			     1, 0, lio->r_nbytes * nstrides,
			     aiop->iosw.sw_flag,
			     aiop->iosw.sw_error, aiop->iosw.sw_count,
			     format_listio(req, lio->r_cmd, &lio_req, 1, fd,
					   Pattern));
		aio_unregister(aio_id);
		doio_upanic(U_IOSW);
		goto lio_done;
	}

	aio_unregister(aio_id);

	/*
	 * Verify that the data was written correctly - check_file() returns
	 * a non-null pointer which contains an error message if there are
	 * problems.
	 *
	 * For listio, we basically have to make 1 call to check_file for each
	 * stride.
	 */

	if (v_opt && lio_req.li_opcode == LO_WRITE) {
		fstride = lio->r_filestride ? lio->r_filestride : lio->r_nbytes;
		mstride = lio->r_memstride ? lio->r_memstride : lio->r_nbytes;
		foffset = lio->r_offset;

		if (mstride > 0 || lio->r_nstrides <= 1) {
			moffset = addr;
		} else {
			moffset = addr + mem_needed - lio->r_nbytes;
		}

		for (i = 0; i < lio_req.li_nstride; i++) {
			msg = check_file(lio->r_file,
					 foffset, lio->r_nbytes,
					 Pattern, Pattern_Length,
					 moffset - addr,
					 lio->r_oflags & O_PARALLEL);

			if (msg != NULL) {
				doio_fprintf(stderr, "%s\n%s\n",
					     msg,
					     format_listio(req, lio->r_cmd,
							   &lio_req, 1, fd,
							   Pattern));
				doio_upanic(U_CORRUPTION);
				exit(E_COMPARE);
			}

			moffset += mstride;
			foffset += fstride;
		}

	}

	rval = 0;

lio_done:

	/*
	 * General cleanup ...
	 *
	 */

	/*
	 * Release file locks if necessary
	 */

	if (got_lock) {
		if (lock_file_region(lio->r_file, fd, F_UNLCK,
				     min_byte, (max_byte - min_byte + 1)) < 0) {
			return -1;
		}
	}

	return rval;
#else
	return -1;
#endif
}
2330
2331/*
2332 * perform ssread/sswrite operations
2333 */
2334
#ifdef _CRAY1

/*
 * Perform an ssread or sswrite request between core memory and sds.
 *
 * For SSWRITE the core buffer is filled with the request's pattern and
 * shipped to the sds; when v_opt is set the data is read back and
 * compared.  Any other request type reads from the sds into core.
 *
 * Returns 0 on success, -1 if sds allocation or a transfer failed, or
 * the negative value returned by alloc_mem().  Exits with E_COMPARE if
 * the read-back data does not match the pattern.
 */
int do_ssdio(struct io_req *req)
{
	int nbytes, nb;

	nbytes = req->r_data.ssread.r_nbytes;

	/*
	 * Grab core and sds space
	 */

	if ((nb = alloc_mem(nbytes)) < 0)
		return nb;

	if (alloc_sds(nbytes) == -1)
		return -1;

	if (req->r_type == SSWRITE) {

		/*
		 * Init data and ship it to the ssd
		 */

		Pattern[0] = req->r_data.sswrite.r_pattern;
		/*pattern_fill(Memptr, nbytes, Pattern, Pattern_Length, 0); */
		(*Data_Fill) (Memptr, nbytes, Pattern, Pattern_Length, 0);

		if (sswrite((long)Memptr, (long)Sdsptr, btoc(nbytes)) == -1) {
			doio_fprintf(stderr, "sswrite() failed:  %s (%d)\n%s\n",
				     SYSERR, errno,
				     format_sds(req, Memptr, Sdsptr, Pattern));
			doio_upanic(U_RVAL);
			return -1;
		}
	} else {
		/*
		 * read from sds
		 */

		if (ssread((long)Memptr, (long)Sdsptr, btoc(nbytes)) == -1) {
			doio_fprintf(stderr, "ssread() failed: %s (%d)\n%s\n",
				     SYSERR, errno,
				     format_sds(req, Memptr, Sdsptr, Pattern));

			doio_upanic(U_RVAL);
			return -1;
		}
	}

	/*
	 * Verify data if SSWRITE and v_opt
	 */

	if (v_opt && req->r_type == SSWRITE) {
		/* a failed read-back must not be compared as if it were data */
		if (ssread((long)Memptr, (long)Sdsptr, btoc(nbytes)) == -1) {
			doio_fprintf(stderr, "ssread() failed: %s (%d)\n%s\n",
				     SYSERR, errno,
				     format_sds(req, Memptr, Sdsptr, Pattern));

			doio_upanic(U_RVAL);
			return -1;
		}

		if (pattern_check(Memptr, nbytes, Pattern, Pattern_Length, 0) ==
		    -1) {
			doio_fprintf(stderr,
				     "sds DATA COMPARE ERROR - ABORTING\n%s\n",
				     format_sds(req, Memptr, Sdsptr, Pattern));

			doio_upanic(U_CORRUPTION);
			exit(E_COMPARE);
		}
	}

	/* the function is declared int: report success explicitly */
	return 0;
}

#else

#ifdef CRAY

/*
 * ssread/sswrite are only available on _CRAY1 systems; being called on
 * any other CRAY system is an internal error, so report it, release the
 * i/o buffer and exit with E_INTERNAL.  Does not return.
 */
int do_ssdio(struct io_req *req)
{
	doio_fprintf(stderr,
		     "Internal Error - do_ssdio() called on a non-cray1 system\n");
	alloc_mem(-1);
	exit(E_INTERNAL);
	/* NOTREACHED */
}

#endif /* CRAY */

#endif /* _CRAY1 */
2420
2421char *fmt_ioreq(struct io_req *ioreq, struct syscall_info *sy, int fd)
2422{
2423	static char *errbuf = NULL;
2424	char *cp;
2425	struct rw_req *io;
2426	struct smap *aname;
2427#ifdef CRAY
2428	struct stat sbuf;
2429#endif
2430#ifdef sgi
2431	struct dioattr finfo;
2432#endif
2433
2434	if (errbuf == NULL)
2435		errbuf = malloc(32768);
2436
2437	io = &ioreq->r_data.io;
2438
2439	/*
2440	 * Look up async I/O completion strategy
2441	 */
2442	for (aname = aionames;
2443	     aname->value != -1 && aname->value != io->r_aio_strat; aname++) ;
2444
2445	cp = errbuf;
2446	cp += sprintf(cp, "Request number %d\n", Reqno);
2447
2448	cp +=
2449	    sprintf(cp, "          fd %d is file %s - open flags are %#o %s\n",
2450		    fd, io->r_file, io->r_oflags, format_oflags(io->r_oflags));
2451
2452	if (sy->sy_flags & SY_WRITE) {
2453		cp +=
2454		    sprintf(cp,
2455			    "          write done at file offset %d - pattern is %c (%#o)\n",
2456			    io->r_offset,
2457			    (io->r_pattern == '\0') ? '?' : io->r_pattern,
2458			    io->r_pattern);
2459	} else {
2460		cp += sprintf(cp, "          read done at file offset %d\n",
2461			      io->r_offset);
2462	}
2463
2464	if (sy->sy_flags & SY_ASYNC) {
2465		cp +=
2466		    sprintf(cp,
2467			    "          async io completion strategy is %s\n",
2468			    aname->string);
2469	}
2470
2471	cp +=
2472	    sprintf(cp,
2473		    "          number of requests is %d, strides per request is %d\n",
2474		    io->r_nent, io->r_nstrides);
2475
2476	cp += sprintf(cp, "          i/o byte count = %d\n", io->r_nbytes);
2477
2478	cp += sprintf(cp, "          memory alignment is %s\n",
2479		      (io->
2480		       r_uflags & F_WORD_ALIGNED) ? "aligned" : "unaligned");
2481
2482#ifdef CRAY
2483	if (io->r_oflags & O_RAW) {
2484		cp +=
2485		    sprintf(cp,
2486			    "          RAW I/O: offset %% 4096 = %d length %% 4096 = %d\n",
2487			    io->r_offset % 4096, io->r_nbytes % 4096);
2488		fstat(fd, &sbuf);
2489		cp +=
2490		    sprintf(cp,
2491			    "          optimal file xfer size: small: %d large: %d\n",
2492			    sbuf.st_blksize, sbuf.st_oblksize);
2493		cp +=
2494		    sprintf(cp, "          cblks %d cbits %#o\n", sbuf.st_cblks,
2495			    sbuf.st_cbits);
2496	}
2497#endif
2498#ifdef sgi
2499	if (io->r_oflags & O_DIRECT) {
2500
2501		if (fcntl(fd, F_DIOINFO, &finfo) == -1) {
2502			cp +=
2503			    sprintf(cp,
2504				    "          Error %s (%d) getting direct I/O info\n",
2505				    strerror(errno), errno);
2506			finfo.d_mem = 1;
2507			finfo.d_miniosz = 1;
2508			finfo.d_maxiosz = 1;
2509		}
2510
2511		cp +=
2512		    sprintf(cp,
2513			    "          DIRECT I/O: offset %% %d = %d length %% %d = %d\n",
2514			    finfo.d_miniosz, io->r_offset % finfo.d_miniosz,
2515			    io->r_nbytes, io->r_nbytes % finfo.d_miniosz);
2516		cp +=
2517		    sprintf(cp,
2518			    "          mem alignment 0x%x xfer size: small: %d large: %d\n",
2519			    finfo.d_mem, finfo.d_miniosz, finfo.d_maxiosz);
2520	}
2521#endif
2522
2523	return (errbuf);
2524}
2525
2526/*
2527 * Issue listio requests
2528 */
2529#ifdef CRAY
2530struct status *sy_listio(struct io_req *req, struct syscall_info *sysc, int fd,
2531			 char *addr)
2532{
2533	int offset, nbytes, nstrides, nents, aio_strat;
2534	int aio_id, signo, o, i, lc;
2535	char *a;
2536	struct listreq *lio_req, *l;
2537	struct aio_info *aiop;
2538	struct status *status;
2539
2540	/*
2541	 * Initialize common fields - assumes r_oflags, r_file, r_offset, and
2542	 * r_nbytes are at the same offset in the read_req and reada_req
2543	 * structures.
2544	 */
2545	offset = req->r_data.io.r_offset;
2546	nbytes = req->r_data.io.r_nbytes;
2547	nstrides = req->r_data.io.r_nstrides;
2548	nents = req->r_data.io.r_nent;
2549	aio_strat = req->r_data.io.r_aio_strat;
2550
2551	lc = (sysc->sy_flags & SY_ASYNC) ? LC_START : LC_WAIT;
2552
2553	status = malloc(sizeof(struct status));
2554	if (status == NULL) {
2555		doio_fprintf(stderr, "malloc failed, %s/%d\n",
2556			     __FILE__, __LINE__);
2557		return NULL;
2558	}
2559	status->aioid = malloc((nents + 1) * sizeof(int));
2560	if (status->aioid == NULL) {
2561		doio_fprintf(stderr, "malloc failed, %s/%d\n",
2562			     __FILE__, __LINE__);
2563		return NULL;
2564	}
2565
2566	signo = (aio_strat == A_SIGNAL) ? SIGUSR1 : 0;
2567
2568	lio_req = malloc(nents * sizeof(struct listreq));
2569	if (lio_req == NULL) {
2570		doio_fprintf(stderr, "malloc failed, %s/%d\n",
2571			     __FILE__, __LINE__);
2572		return NULL;
2573	}
2574	for (l = lio_req, a = addr, o = offset, i = 0;
2575	     i < nents; l++, a += nbytes, o += nbytes, i++) {
2576
2577		aio_id = aio_register(fd, aio_strat, signo);
2578		aiop = aio_slot(aio_id);
2579		status->aioid[i] = aio_id;
2580
2581		l->li_opcode = (sysc->sy_flags & SY_WRITE) ? LO_WRITE : LO_READ;
2582		l->li_offset = o;
2583		l->li_fildes = fd;
2584		l->li_buf = a;
2585		l->li_nbyte = nbytes;
2586		l->li_status = &aiop->iosw;
2587		l->li_signo = signo;
2588		l->li_nstride = nstrides;
2589		l->li_filstride = 0;
2590		l->li_memstride = 0;
2591		l->li_drvr = 0;
2592		l->li_flags = LF_LSEEK;
2593	}
2594
2595	status->aioid[nents] = -1;	/* end sentinel */
2596
2597	if ((status->rval = listio(lc, lio_req, nents)) == -1) {
2598		status->err = errno;
2599	}
2600
2601	free(lio_req);
2602	return (status);
2603}
2604
2605/*
2606 * Calculate the size of a request in bytes and min/max boundaries
2607 *
2608 * This assumes filestride & memstride = 0.
2609 */
2610int listio_mem(struct io_req *req, int offset, int fmstride, int *min, int *max)
2611{
2612	int i, size;
2613
2614	size = stride_bounds(offset, fmstride,
2615			     req->r_data.io.r_nstrides * req->r_data.io.r_nent,
2616			     req->r_data.io.r_nbytes, min, max);
2617	return (size);
2618}
2619
2620char *fmt_listio(struct io_req *req, struct syscall_info *sy, int fd,
2621		 char *addr)
2622{
2623	static char *errbuf = NULL;
2624	char *cp;
2625	char *c, *opcode;
2626	int i;
2627
2628	if (errbuf == NULL) {
2629		errbuf = malloc(32768);
2630		if (errbuf == NULL) {
2631			doio_fprintf(stderr, "malloc failed, %s/%d\n",
2632				     __FILE__, __LINE__);
2633			return NULL;
2634		}
2635	}
2636
2637	c = (sy->sy_flags & SY_ASYNC) ? "lc_wait" : "lc_start";
2638
2639	cp = errbuf;
2640	cp += sprintf(cp, "syscall:  listio(%s, (?), %d)\n",
2641		      c, req->r_data.io.r_nent);
2642
2643	cp += sprintf(cp, "          data buffer at %#o\n", addr);
2644
2645	return (errbuf);
2646}
2647#endif /* CRAY */
2648
2649#ifdef sgi
2650struct status *sy_pread(struct io_req *req, struct syscall_info *sysc, int fd,
2651			char *addr)
2652{
2653	int rc;
2654	struct status *status;
2655
2656	rc = pread(fd, addr, req->r_data.io.r_nbytes, req->r_data.io.r_offset);
2657
2658	status = malloc(sizeof(struct status));
2659	if (status == NULL) {
2660		doio_fprintf(stderr, "malloc failed, %s/%d\n",
2661			     __FILE__, __LINE__);
2662		return NULL;
2663	}
2664	status->aioid = NULL;
2665	status->rval = rc;
2666	status->err = errno;
2667
2668	return (status);
2669}
2670
2671struct status *sy_pwrite(struct io_req *req, struct syscall_info *sysc, int fd,
2672			 char *addr)
2673{
2674	int rc;
2675	struct status *status;
2676
2677	rc = pwrite(fd, addr, req->r_data.io.r_nbytes, req->r_data.io.r_offset);
2678
2679	status = malloc(sizeof(struct status));
2680	if (status == NULL) {
2681		doio_fprintf(stderr, "malloc failed, %s/%d\n",
2682			     __FILE__, __LINE__);
2683		return NULL;
2684	}
2685	status->aioid = NULL;
2686	status->rval = rc;
2687	status->err = errno;
2688
2689	return (status);
2690}
2691
2692char *fmt_pread(struct io_req *req, struct syscall_info *sy, int fd, char *addr)
2693{
2694	static char *errbuf = NULL;
2695	char *cp;
2696
2697	if (errbuf == NULL) {
2698		errbuf = malloc(32768);
2699		if (errbuf == NULL) {
2700			doio_fprintf(stderr, "malloc failed, %s/%d\n",
2701				     __FILE__, __LINE__);
2702			return NULL;
2703		}
2704	}
2705
2706	cp = errbuf;
2707	cp += sprintf(cp, "syscall:  %s(%d, 0x%lx, %d)\n",
2708		      sy->sy_name, fd, addr, req->r_data.io.r_nbytes);
2709	return (errbuf);
2710}
2711#endif /* sgi */
2712
2713#ifndef CRAY
/*
 * sy_readv - readv() entry point for the syscall table; forwards to
 * sy_rwv() with rw = 0 (read).
 */
struct status *sy_readv(struct io_req *req, struct syscall_info *sysc, int fd,
			char *addr)
{
	/*
	 * Full prototype rather than "sy_rwv()": an empty parameter list
	 * gives no argument checking and means "no parameters" in C23.
	 */
	struct status *sy_rwv(struct io_req *req, struct syscall_info *sysc,
			      int fd, char *addr, int rw);

	return sy_rwv(req, sysc, fd, addr, 0);
}
2720
/*
 * sy_writev - writev() entry point for the syscall table; forwards to
 * sy_rwv() with rw = 1 (write).
 */
struct status *sy_writev(struct io_req *req, struct syscall_info *sysc, int fd,
			 char *addr)
{
	/*
	 * Full prototype rather than "sy_rwv()": an empty parameter list
	 * gives no argument checking and means "no parameters" in C23.
	 */
	struct status *sy_rwv(struct io_req *req, struct syscall_info *sysc,
			      int fd, char *addr, int rw);

	return sy_rwv(req, sysc, fd, addr, 1);
}
2727
2728struct status *sy_rwv(struct io_req *req, struct syscall_info *sysc, int fd,
2729		      char *addr, int rw)
2730{
2731	int rc;
2732	struct status *status;
2733	struct iovec iov[2];
2734
2735	status = malloc(sizeof(struct status));
2736	if (status == NULL) {
2737		doio_fprintf(stderr, "malloc failed, %s/%d\n",
2738			     __FILE__, __LINE__);
2739		return NULL;
2740	}
2741	status->aioid = NULL;
2742
2743	/* move to the desired file position. */
2744	if ((rc = lseek(fd, req->r_data.io.r_offset, SEEK_SET)) == -1) {
2745		status->rval = rc;
2746		status->err = errno;
2747		return (status);
2748	}
2749
2750	iov[0].iov_base = addr;
2751	iov[0].iov_len = req->r_data.io.r_nbytes;
2752
2753	if (rw)
2754		rc = writev(fd, iov, 1);
2755	else
2756		rc = readv(fd, iov, 1);
2757	status->aioid = NULL;
2758	status->rval = rc;
2759	status->err = errno;
2760	return (status);
2761}
2762
2763char *fmt_readv(struct io_req *req, struct syscall_info *sy, int fd, char *addr)
2764{
2765	static char errbuf[32768];
2766	char *cp;
2767
2768	cp = errbuf;
2769	cp += sprintf(cp, "syscall:  %s(%d, (iov on stack), 1)\n",
2770		      sy->sy_name, fd);
2771	return (errbuf);
2772}
2773#endif /* !CRAY */
2774
2775#ifdef sgi
/*
 * sy_aread - async read entry point for the syscall table; forwards to
 * sy_arw() with rw = 0 (read).
 */
struct status *sy_aread(struct io_req *req, struct syscall_info *sysc, int fd,
			char *addr)
{
	/*
	 * Full prototype rather than "sy_arw()": an empty parameter list
	 * gives no argument checking and means "no parameters" in C23.
	 */
	struct status *sy_arw(struct io_req *req, struct syscall_info *sysc,
			      int fd, char *addr, int rw);

	return sy_arw(req, sysc, fd, addr, 0);
}
2782
/*
 * sy_awrite - async write entry point for the syscall table; forwards to
 * sy_arw() with rw = 1 (write).
 */
struct status *sy_awrite(struct io_req *req, struct syscall_info *sysc, int fd,
			 char *addr)
{
	/*
	 * Full prototype rather than "sy_arw()": an empty parameter list
	 * gives no argument checking and means "no parameters" in C23.
	 */
	struct status *sy_arw(struct io_req *req, struct syscall_info *sysc,
			      int fd, char *addr, int rw);

	return sy_arw(req, sysc, fd, addr, 1);
}
2789
2790/*
2791  #define sy_aread(A, B, C, D)	sy_arw(A, B, C, D, 0)
2792  #define sy_awrite(A, B, C, D)	sy_arw(A, B, C, D, 1)
2793 */
2794
2795struct status *sy_arw(struct io_req *req, struct syscall_info *sysc, int fd,
2796		      char *addr, int rw)
2797{
2798	/* POSIX 1003.1b-1993 Async read */
2799	struct status *status;
2800	int rc;
2801	int aio_id, aio_strat, signo;
2802	struct aio_info *aiop;
2803
2804	status = malloc(sizeof(struct status));
2805	if (status == NULL) {
2806		doio_fprintf(stderr, "malloc failed, %s/%d\n",
2807			     __FILE__, __LINE__);
2808		return NULL;
2809	}
2810	aio_strat = req->r_data.io.r_aio_strat;
2811	signo = (aio_strat == A_SIGNAL) ? SIGUSR1 : 0;
2812
2813	aio_id = aio_register(fd, aio_strat, signo);
2814	aiop = aio_slot(aio_id);
2815
2816	memset((void *)&aiop->aiocb, 0, sizeof(aiocb_t));
2817
2818	aiop->aiocb.aio_fildes = fd;
2819	aiop->aiocb.aio_nbytes = req->r_data.io.r_nbytes;
2820	aiop->aiocb.aio_offset = req->r_data.io.r_offset;
2821	aiop->aiocb.aio_buf = addr;
2822	aiop->aiocb.aio_reqprio = 0;	/* must be 0 */
2823	aiop->aiocb.aio_lio_opcode = 0;
2824
2825	if (aio_strat == A_SIGNAL) {	/* siginfo(2) stuff */
2826		aiop->aiocb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;
2827		aiop->aiocb.aio_sigevent.sigev_signo = signo;
2828	} else if (aio_strat == A_CALLBACK) {
2829		aiop->aiocb.aio_sigevent.sigev_signo = 0;
2830		aiop->aiocb.aio_sigevent.sigev_notify = SIGEV_CALLBACK;
2831		aiop->aiocb.aio_sigevent.sigev_func = cb_handler;
2832		aiop->aiocb.aio_sigevent.sigev_value.sival_int = aio_id;
2833	} else {
2834		aiop->aiocb.aio_sigevent.sigev_notify = SIGEV_NONE;
2835		aiop->aiocb.aio_sigevent.sigev_signo = 0;
2836	}
2837
2838	if (rw)
2839		rc = aio_write(&aiop->aiocb);
2840	else
2841		rc = aio_read(&aiop->aiocb);
2842
2843	status->aioid = malloc(2 * sizeof(int));
2844	if (status->aioid == NULL) {
2845		doio_fprintf(stderr, "malloc failed, %s/%d\n",
2846			     __FILE__, __LINE__);
2847		return NULL;
2848	}
2849	status->aioid[0] = aio_id;
2850	status->aioid[1] = -1;
2851	status->rval = rc;
2852	status->err = errno;
2853	return (status);
2854}
2855
2856char *fmt_aread(struct io_req *req, struct syscall_info *sy, int fd, char *addr)
2857{
2858	static char errbuf[32768];
2859	char *cp;
2860
2861	cp = errbuf;
2862	cp += sprintf(cp, "syscall:  %s(&aiop->aiocb)\n", sy->sy_name);
2863	return (errbuf);
2864}
2865#endif /* sgi */
2866
2867#ifndef CRAY
2868
/*
 * sy_mmread - mmap-read entry point for the syscall table; forwards to
 * sy_mmrw() with rw = 0 (read).
 */
struct status *sy_mmread(struct io_req *req, struct syscall_info *sysc, int fd,
			 char *addr)
{
	/*
	 * Full prototype rather than "sy_mmrw()": an empty parameter list
	 * gives no argument checking and means "no parameters" in C23.
	 */
	struct status *sy_mmrw(struct io_req *req, struct syscall_info *sysc,
			       int fd, char *addr, int rw);

	return sy_mmrw(req, sysc, fd, addr, 0);
}
2875
/*
 * sy_mmwrite - mmap-write entry point for the syscall table; forwards to
 * sy_mmrw() with rw = 1 (write).
 */
struct status *sy_mmwrite(struct io_req *req, struct syscall_info *sysc, int fd,
			  char *addr)
{
	/*
	 * Full prototype rather than "sy_mmrw()": an empty parameter list
	 * gives no argument checking and means "no parameters" in C23.
	 */
	struct status *sy_mmrw(struct io_req *req, struct syscall_info *sysc,
			       int fd, char *addr, int rw);

	return sy_mmrw(req, sysc, fd, addr, 1);
}
2882
2883struct status *sy_mmrw(struct io_req *req, struct syscall_info *sysc, int fd,
2884		       char *addr, int rw)
2885{
2886	/*
2887	 * mmap read/write
2888	 * This version is oriented towards mmaping the file to memory
2889	 * ONCE and keeping it mapped.
2890	 */
2891	struct status *status;
2892	void *mrc = NULL, *memaddr = NULL;
2893	struct fd_cache *fdc;
2894	struct stat sbuf;
2895	int rc;
2896
2897	status = malloc(sizeof(struct status));
2898	if (status == NULL) {
2899		doio_fprintf(stderr, "malloc failed, %s/%d\n",
2900			     __FILE__, __LINE__);
2901		return NULL;
2902	}
2903	status->aioid = NULL;
2904	status->rval = -1;
2905
2906	fdc = alloc_fdcache(req->r_data.io.r_file, req->r_data.io.r_oflags);
2907
2908	if (v_opt || fdc->c_memaddr == NULL) {
2909		if (fstat(fd, &sbuf) < 0) {
2910			doio_fprintf(stderr, "fstat failed, errno=%d\n", errno);
2911			status->err = errno;
2912			return (status);
2913		}
2914
2915		fdc->c_memlen = (int)sbuf.st_size;
2916		mrc = mmap(NULL, (int)sbuf.st_size,
2917			   rw ? PROT_WRITE | PROT_READ : PROT_READ,
2918			   MAP_SHARED, fd, 0);
2919
2920		if (mrc == MAP_FAILED) {
2921			doio_fprintf(stderr, "mmap() failed - 0x%lx %d\n",
2922				     mrc, errno);
2923			status->err = errno;
2924			return (status);
2925		}
2926
2927		fdc->c_memaddr = mrc;
2928	}
2929
2930	memaddr = (void *)((char *)fdc->c_memaddr + req->r_data.io.r_offset);
2931
2932	active_mmap_rw = 1;
2933	if (rw)
2934		memcpy(memaddr, addr, req->r_data.io.r_nbytes);
2935	else
2936		memcpy(addr, memaddr, req->r_data.io.r_nbytes);
2937	if (v_opt)
2938		msync(fdc->c_memaddr, (int)sbuf.st_size, MS_SYNC);
2939	active_mmap_rw = 0;
2940
2941	status->rval = req->r_data.io.r_nbytes;
2942	status->err = 0;
2943
2944	if (v_opt) {
2945		rc = munmap(mrc, (int)sbuf.st_size);
2946	}
2947
2948	return (status);
2949}
2950
2951char *fmt_mmrw(struct io_req *req, struct syscall_info *sy, int fd, char *addr)
2952{
2953	static char errbuf[32768];
2954	char *cp;
2955	struct fd_cache *fdc;
2956	void *memaddr;
2957
2958	fdc = alloc_fdcache(req->r_data.io.r_file, req->r_data.io.r_oflags);
2959
2960	cp = errbuf;
2961	cp += sprintf(cp, "syscall:  %s(NULL, %d, %s, MAP_SHARED, %d, 0)\n",
2962		      sy->sy_name,
2963		      fdc->c_memlen,
2964		      (sy->sy_flags & SY_WRITE) ? "PROT_WRITE" : "PROT_READ",
2965		      fd);
2966
2967	cp += sprintf(cp, "\tfile is mmaped to: 0x%lx\n",
2968		      (unsigned long)fdc->c_memaddr);
2969
2970	memaddr = (void *)((char *)fdc->c_memaddr + req->r_data.io.r_offset);
2971
2972	cp += sprintf(cp, "\tfile-mem=0x%lx, length=%d, buffer=0x%lx\n",
2973		      (unsigned long)memaddr, req->r_data.io.r_nbytes,
2974		      (unsigned long)addr);
2975
2976	return (errbuf);
2977}
2978#endif /* !CRAY */
2979
2980struct syscall_info syscalls[] = {
2981#ifdef CRAY
2982	{"listio-read-sync", LREAD,
2983	 sy_listio, NULL, fmt_listio,
2984	 SY_IOSW},
2985	{"listio-read-strides-sync", LSREAD,
2986	 sy_listio, listio_mem, fmt_listio,
2987	 SY_IOSW},
2988	{"listio-read-reqs-sync", LEREAD,
2989	 sy_listio, listio_mem, fmt_listio,
2990	 SY_IOSW},
2991	{"listio-read-async", LREADA,
2992	 sy_listio, NULL, fmt_listio,
2993	 SY_IOSW | SY_ASYNC},
2994	{"listio-read-strides-async", LSREADA,
2995	 sy_listio, listio_mem, fmt_listio,
2996	 SY_IOSW | SY_ASYNC},
2997	{"listio-read-reqs-async", LEREADA,
2998	 sy_listio, listio_mem, fmt_listio,
2999	 SY_IOSW | SY_ASYNC},
3000	{"listio-write-sync", LWRITE,
3001	 sy_listio, listio_mem, fmt_listio,
3002	 SY_IOSW | SY_WRITE},
3003	{"listio-write-strides-sync", LSWRITE,
3004	 sy_listio, listio_mem, fmt_listio,
3005	 SY_IOSW | SY_WRITE},
3006	{"listio-write-reqs-sync", LEWRITE,
3007	 sy_listio, listio_mem, fmt_listio,
3008	 SY_IOSW | SY_WRITE},
3009	{"listio-write-async", LWRITEA,
3010	 sy_listio, listio_mem, fmt_listio,
3011	 SY_IOSW | SY_WRITE | SY_ASYNC},
3012	{"listio-write-strides-async", LSWRITEA,
3013	 sy_listio, listio_mem, fmt_listio,
3014	 SY_IOSW | SY_WRITE | SY_ASYNC},
3015	{"listio-write-reqs-async", LEWRITEA,
3016	 sy_listio, listio_mem, fmt_listio,
3017	 SY_IOSW | SY_WRITE | SY_ASYNC},
3018#endif
3019
3020#ifdef sgi
3021	{"aread", AREAD,
3022	 sy_aread, NULL, fmt_aread,
3023	 SY_IOSW | SY_ASYNC},
3024	{"awrite", AWRITE,
3025	 sy_awrite, NULL, fmt_aread,
3026	 SY_IOSW | SY_WRITE | SY_ASYNC},
3027	{"pread", PREAD,
3028	 sy_pread, NULL, fmt_pread,
3029	 0},
3030	{"pwrite", PWRITE,
3031	 sy_pwrite, NULL, fmt_pread,
3032	 SY_WRITE},
3033#endif
3034
3035#ifndef CRAY
3036	{"readv", READV,
3037	 sy_readv, NULL, fmt_readv,
3038	 0},
3039	{"writev", WRITEV,
3040	 sy_writev, NULL, fmt_readv,
3041	 SY_WRITE},
3042	{"mmap-read", MMAPR,
3043	 sy_mmread, NULL, fmt_mmrw,
3044	 0},
3045	{"mmap-write", MMAPW,
3046	 sy_mmwrite, NULL, fmt_mmrw,
3047	 SY_WRITE},
3048#endif
3049
3050	{NULL, 0,
3051	 0, 0, 0,
3052	 0},
3053};
3054
3055int do_rw(struct io_req *req)
3056{
3057	static int pid = -1;
3058	int fd, offset, nbytes, nstrides, nents, oflags;
3059	int rval, mem_needed, i;
3060	int logged_write, got_lock, pattern;
3061	off_t woffset;
3062	int min_byte, max_byte;
3063	char *addr, *file, *msg;
3064	struct status *s;
3065	struct wlog_rec wrec;
3066	struct syscall_info *sy;
3067#if defined(CRAY) || defined(sgi)
3068	struct aio_info *aiop;
3069	struct iosw *iosw;
3070#endif
3071#ifdef sgi
3072	struct fd_cache *fdc;
3073#endif
3074
3075	woffset = 0;
3076
3077	/*
3078	 * Initialize common fields - assumes r_oflags, r_file, r_offset, and
3079	 * r_nbytes are at the same offset in the read_req and reada_req
3080	 * structures.
3081	 */
3082	file = req->r_data.io.r_file;
3083	oflags = req->r_data.io.r_oflags;
3084	offset = req->r_data.io.r_offset;
3085	nbytes = req->r_data.io.r_nbytes;
3086	nstrides = req->r_data.io.r_nstrides;
3087	nents = req->r_data.io.r_nent;
3088	pattern = req->r_data.io.r_pattern;
3089
3090	if (nents >= MAX_AIO) {
3091		doio_fprintf(stderr,
3092			     "do_rw: too many list requests, %d.  Maximum is %d\n",
3093			     nents, MAX_AIO);
3094		return (-1);
3095	}
3096
3097	/*
3098	 * look up system call info
3099	 */
3100	for (sy = syscalls; sy->sy_name != NULL && sy->sy_type != req->r_type;
3101	     sy++) ;
3102
3103	if (sy->sy_name == NULL) {
3104		doio_fprintf(stderr, "do_rw: unknown r_type %d.\n",
3105			     req->r_type);
3106		return (-1);
3107	}
3108
3109	/*
3110	 * Get an open file descriptor
3111	 * Note: must be done before memory allocation so that the direct i/o
3112	 *      information is available in mem. allocate
3113	 */
3114
3115	if ((fd = alloc_fd(file, oflags)) == -1)
3116		return -1;
3117
3118	/*
3119	 * Allocate core memory and possibly sds space.  Initialize the
3120	 * data to be written.  Make sure we get enough, based on the
3121	 * memstride.
3122	 *
3123	 * need:
3124	 *      1 extra word for possible partial-word address "bump"
3125	 *      1 extra word for dynamic pattern overrun
3126	 *      MPP_BUMP extra words for T3E non-hw-aligned memory address.
3127	 */
3128
3129	if (sy->sy_buffer != NULL) {
3130		mem_needed = (*sy->sy_buffer) (req, 0, 0, NULL, NULL);
3131	} else {
3132		mem_needed = nbytes;
3133	}
3134
3135#ifdef CRAY
3136	if ((rval =
3137	     alloc_mem(mem_needed + wtob(1) * 2 +
3138		       MPP_BUMP * sizeof(UINT64_T))) < 0) {
3139		return rval;
3140	}
3141#else
3142#ifdef sgi
3143	/* get memory alignment for using DIRECT I/O */
3144	fdc = alloc_fdcache(file, oflags);
3145
3146	if ((rval = alloc_mem(mem_needed + wtob(1) * 2 + fdc->c_memalign)) < 0) {
3147		return rval;
3148	}
3149#else
3150	/* what is !CRAY && !sgi ? */
3151	if ((rval = alloc_mem(mem_needed + wtob(1) * 2)) < 0) {
3152		return rval;
3153	}
3154#endif /* sgi */
3155#endif /* CRAY */
3156
3157	Pattern[0] = pattern;
3158
3159	/*
3160	 * Allocate SDS space for backdoor write if desired
3161	 */
3162
3163	if (oflags & O_SSD) {
3164#ifdef CRAY
3165#ifndef _CRAYMPP
3166		if (alloc_sds(nbytes) == -1)
3167			return -1;
3168
3169		if (sy->sy_flags & SY_WRITE) {
3170			/*pattern_fill(Memptr, mem_needed, Pattern, Pattern_Length, 0); */
3171			(*Data_Fill) (Memptr, nbytes, Pattern, Pattern_Length,
3172				      0);
3173
3174			if (sswrite((long)Memptr, Sdsptr, btoc(mem_needed)) ==
3175			    -1) {
3176				doio_fprintf(stderr,
3177					     "sswrite(%d, %d, %d) failed:  %s (%d)\n",
3178					     (long)Memptr, Sdsptr,
3179					     btoc(mem_needed), SYSERR, errno);
3180				fflush(stderr);
3181				return -1;
3182			}
3183		}
3184
3185		addr = (char *)Sdsptr;
3186#else
3187		doio_fprintf(stderr,
3188			     "Invalid O_SSD flag was generated for MPP system\n");
3189		fflush(stderr);
3190		return -1;
3191#endif /* _CRAYMPP */
3192#else /* CRAY */
3193		doio_fprintf(stderr,
3194			     "Invalid O_SSD flag was generated for non-Cray system\n");
3195		fflush(stderr);
3196		return -1;
3197#endif /* CRAY */
3198	} else {
3199		addr = Memptr;
3200
3201		/*
3202		 * if io is not raw, bump the offset by a random amount
3203		 * to generate non-word-aligned io.
3204		 *
3205		 * On MPP systems, raw I/O must start on an 0x80 byte boundary.
3206		 * For non-aligned I/O, bump the address from 1 to 8 words.
3207		 */
3208
3209		if (!(req->r_data.io.r_uflags & F_WORD_ALIGNED)) {
3210#ifdef _CRAYMPP
3211			addr +=
3212			    random_range(0, MPP_BUMP, 1, NULL) * sizeof(int);
3213#endif
3214			addr += random_range(0, wtob(1) - 1, 1, NULL);
3215		}
3216#ifdef sgi
3217		/*
3218		 * Force memory alignment for Direct I/O
3219		 */
3220		if ((oflags & O_DIRECT) && ((long)addr % fdc->c_memalign != 0)) {
3221			addr +=
3222			    fdc->c_memalign - ((long)addr % fdc->c_memalign);
3223		}
3224#endif
3225
3226		/*
3227		 * FILL must be done on a word-aligned buffer.
3228		 * Call the fill function with Memptr which is aligned,
3229		 * then memmove it to the right place.
3230		 */
3231		if (sy->sy_flags & SY_WRITE) {
3232			(*Data_Fill) (Memptr, mem_needed, Pattern,
3233				      Pattern_Length, 0);
3234			if (addr != Memptr)
3235				memmove(addr, Memptr, mem_needed);
3236		}
3237	}
3238
3239	rval = 0;
3240	got_lock = 0;
3241	logged_write = 0;
3242
3243	/*
3244	 * Lock data if this is a write and locking option is set
3245	 */
3246	if (sy->sy_flags & SY_WRITE && k_opt) {
3247		if (sy->sy_buffer != NULL) {
3248			(*sy->sy_buffer) (req, offset, 0, &min_byte, &max_byte);
3249		} else {
3250			min_byte = offset;
3251			max_byte = offset + (nbytes * nstrides * nents);
3252		}
3253
3254		if (lock_file_region(file, fd, F_WRLCK,
3255				     min_byte, (max_byte - min_byte + 1)) < 0) {
3256			doio_fprintf(stderr,
3257				     "file lock failed:\n%s\n",
3258				     fmt_ioreq(req, sy, fd));
3259			doio_fprintf(stderr,
3260				     "          buffer(req, %d, 0, 0x%x, 0x%x)\n",
3261				     offset, min_byte, max_byte);
3262			alloc_mem(-1);
3263			exit(E_INTERNAL);
3264		}
3265
3266		got_lock = 1;
3267	}
3268
3269	/*
3270	 * Write a preliminary write-log entry.  This is done so that
3271	 * doio_check can do corruption detection across an interrupt/crash.
3272	 * Note that w_done is set to 0.  If doio_check sees this, it
3273	 * re-creates the file extents as if the write completed, but does not
3274	 * do any checking - see comments in doio_check for more details.
3275	 */
3276
3277	if (sy->sy_flags & SY_WRITE && w_opt) {
3278		if (pid == -1) {
3279			pid = getpid();
3280		}
3281
3282		wrec.w_async = (sy->sy_flags & SY_ASYNC) ? 1 : 0;
3283		wrec.w_oflags = oflags;
3284		wrec.w_pid = pid;
3285		wrec.w_offset = offset;
3286		wrec.w_nbytes = nbytes;	/* mem_needed -- total length */
3287
3288		wrec.w_pathlen = strlen(file);
3289		memcpy(wrec.w_path, file, wrec.w_pathlen);
3290		wrec.w_hostlen = strlen(Host);
3291		memcpy(wrec.w_host, Host, wrec.w_hostlen);
3292		wrec.w_patternlen = Pattern_Length;
3293		memcpy(wrec.w_pattern, Pattern, wrec.w_patternlen);
3294
3295		wrec.w_done = 0;
3296
3297		if ((woffset = wlog_record_write(&Wlog, &wrec, -1)) == -1) {
3298			doio_fprintf(stderr,
3299				     "Could not append to write-log:  %s (%d)\n",
3300				     SYSERR, errno);
3301		} else {
3302			logged_write = 1;
3303		}
3304	}
3305
3306	s = (*sy->sy_syscall) (req, sy, fd, addr);
3307
3308	if (s->rval == -1) {
3309		doio_fprintf(stderr,
3310			     "%s() request failed:  %s (%d)\n%s\n%s\n",
3311			     sy->sy_name, SYSERR, errno,
3312			     fmt_ioreq(req, sy, fd),
3313			     (*sy->sy_format) (req, sy, fd, addr));
3314
3315		doio_upanic(U_RVAL);
3316
3317		for (i = 0; i < nents; i++) {
3318			if (s->aioid == NULL)
3319				break;
3320			aio_unregister(s->aioid[i]);
3321		}
3322		rval = -1;
3323	} else {
3324		/*
3325		 * If the syscall was async, wait for I/O to complete
3326		 */
3327#ifndef __linux__
3328		if (sy->sy_flags & SY_ASYNC) {
3329			for (i = 0; i < nents; i++) {
3330				aio_wait(s->aioid[i]);
3331			}
3332		}
3333#endif
3334
3335		/*
3336		 * Check the syscall how-much-data-written return.  Look
3337		 * for this in either the return value or the 'iosw'
3338		 * structure.
3339		 */
3340
3341		if (sy->sy_flags & SY_IOSW) {
3342#ifdef CRAY
3343			for (i = 0; i < nents; i++) {
3344				if (s->aioid == NULL)
3345					break;	/* >>> error condition? */
3346				aiop = aio_slot(s->aioid[i]);
3347				iosw = &aiop->iosw;
3348				if (iosw->sw_error != 0) {
3349					doio_fprintf(stderr,
3350						     "%s() iosw error set: %s\n%s\n%s\n",
3351						     sy->sy_name,
3352						     strerror(iosw->sw_error),
3353						     fmt_ioreq(req, sy, fd),
3354						     (*sy->sy_format) (req, sy,
3355								       fd,
3356								       addr));
3357					doio_upanic(U_IOSW);
3358					rval = -1;
3359				} else if (iosw->sw_count != nbytes * nstrides) {
3360					doio_fprintf(stderr,
3361						     "Bad iosw from %s() #%d\nExpected (%d,%d,%d), got (%d,%d,%d)\n%s\n%s\n",
3362						     sy->sy_name, i,
3363						     1, 0, nbytes * nstrides,
3364						     iosw->sw_flag,
3365						     iosw->sw_error,
3366						     iosw->sw_count,
3367						     fmt_ioreq(req, sy, fd),
3368						     (*sy->sy_format) (req, sy,
3369								       fd,
3370								       addr));
3371					doio_upanic(U_IOSW);
3372					rval = -1;
3373				}
3374
3375				aio_unregister(s->aioid[i]);
3376			}
3377#endif /* CRAY */
3378#ifdef sgi
3379			for (i = 0; s->aioid[i] != -1; i++) {
3380				if (s->aioid == NULL) {
3381					doio_fprintf(stderr,
3382						     "aioid == NULL!\n");
3383					break;
3384				}
3385				aiop = aio_slot(s->aioid[i]);
3386
3387				/*
3388				 * make sure the io completed without error
3389				 */
3390				if (aiop->aio_errno != 0) {
3391					doio_fprintf(stderr,
3392						     "%s() aio error set: %s (%d)\n%s\n%s\n",
3393						     sy->sy_name,
3394						     strerror(aiop->aio_errno),
3395						     aiop->aio_errno,
3396						     fmt_ioreq(req, sy, fd),
3397						     (*sy->sy_format) (req, sy,
3398								       fd,
3399								       addr));
3400					doio_upanic(U_IOSW);
3401					rval = -1;
3402				} else if (aiop->aio_ret != nbytes) {
3403					doio_fprintf(stderr,
3404						     "Bad aio return from %s() #%d\nExpected (%d,%d), got (%d,%d)\n%s\n%s\n",
3405						     sy->sy_name, i,
3406						     0, nbytes,
3407						     aiop->aio_errno,
3408						     aiop->aio_ret,
3409						     fmt_ioreq(req, sy, fd),
3410						     (*sy->sy_format) (req, sy,
3411								       fd,
3412								       addr));
3413					aio_unregister(s->aioid[i]);
3414					doio_upanic(U_IOSW);
3415					return -1;
3416				} else {
3417					aio_unregister(s->aioid[i]);
3418					rval = 0;
3419				}
3420			}
3421#endif /* sgi */
3422		} else {
3423
3424			if (s->rval != mem_needed) {
3425				doio_fprintf(stderr,
3426					     "%s() request returned wrong # of bytes - expected %d, got %d\n%s\n%s\n",
3427					     sy->sy_name, nbytes, s->rval,
3428					     fmt_ioreq(req, sy, fd),
3429					     (*sy->sy_format) (req, sy, fd,
3430							       addr));
3431				rval = -1;
3432				doio_upanic(U_RVAL);
3433			}
3434		}
3435	}
3436
3437	/*
3438	 * Verify that the data was written correctly - check_file() returns
3439	 * a non-null pointer which contains an error message if there are
3440	 * problems.
3441	 */
3442
3443	if (rval == 0 && sy->sy_flags & SY_WRITE && v_opt) {
3444		msg = check_file(file, offset, nbytes * nstrides * nents,
3445				 Pattern, Pattern_Length, 0,
3446				 oflags & O_PARALLEL);
3447		if (msg != NULL) {
3448			doio_fprintf(stderr, "%s\n%s\n%s\n",
3449				     msg,
3450				     fmt_ioreq(req, sy, fd),
3451				     (*sy->sy_format) (req, sy, fd, addr));
3452			doio_upanic(U_CORRUPTION);
3453			exit(E_COMPARE);
3454		}
3455	}
3456
3457	/*
3458	 * General cleanup ...
3459	 *
3460	 * Write extent information to the write-log, so that doio_check can do
3461	 * corruption detection.  Note that w_done is set to 1, indicating that
3462	 * the write has been verified as complete.  We don't need to write the
3463	 * filename on the second logging.
3464	 */
3465
3466	if (w_opt && logged_write) {
3467		wrec.w_done = 1;
3468		wlog_record_write(&Wlog, &wrec, woffset);
3469	}
3470
3471	/*
3472	 * Unlock file region if necessary
3473	 */
3474
3475	if (got_lock) {
3476		if (lock_file_region(file, fd, F_UNLCK,
3477				     min_byte, (max_byte - min_byte + 1)) < 0) {
3478			alloc_mem(-1);
3479			exit(E_INTERNAL);
3480		}
3481	}
3482
3483	if (s->aioid != NULL)
3484		free(s->aioid);
3485	free(s);
3486	return (rval == -1) ? -1 : 0;
3487}
3488
/*
 * fcntl-based requests
 *   - F_RESVSP
 *   - F_UNRESVSP
 *   - F_FSYNC
 */
3495#ifdef sgi
3496int do_fcntl(struct io_req *req)
3497{
3498	int fd, oflags, offset, nbytes;
3499	int rval, op;
3500	int got_lock;
3501	int min_byte, max_byte;
3502	char *file, *msg;
3503	struct flock flk;
3504
3505	/*
3506	 * Initialize common fields - assumes r_oflags, r_file, r_offset, and
3507	 * r_nbytes are at the same offset in the read_req and reada_req
3508	 * structures.
3509	 */
3510	file = req->r_data.io.r_file;
3511	oflags = req->r_data.io.r_oflags;
3512	offset = req->r_data.io.r_offset;
3513	nbytes = req->r_data.io.r_nbytes;
3514
3515	flk.l_type = 0;
3516	flk.l_whence = SEEK_SET;
3517	flk.l_start = offset;
3518	flk.l_len = nbytes;
3519
3520	/*
3521	 * Get an open file descriptor
3522	 */
3523
3524	if ((fd = alloc_fd(file, oflags)) == -1)
3525		return -1;
3526
3527	rval = 0;
3528	got_lock = 0;
3529
3530	/*
3531	 * Lock data if this is locking option is set
3532	 */
3533	if (k_opt) {
3534		min_byte = offset;
3535		max_byte = offset + nbytes;
3536
3537		if (lock_file_region(file, fd, F_WRLCK,
3538				     min_byte, (nbytes + 1)) < 0) {
3539			doio_fprintf(stderr, "file lock failed:\n");
3540			doio_fprintf(stderr,
3541				     "          buffer(req, %d, 0, 0x%x, 0x%x)\n",
3542				     offset, min_byte, max_byte);
3543			alloc_mem(-1);
3544			exit(E_INTERNAL);
3545		}
3546
3547		got_lock = 1;
3548	}
3549
3550	switch (req->r_type) {
3551	case RESVSP:
3552		op = F_RESVSP;
3553		msg = "f_resvsp";
3554		break;
3555	case UNRESVSP:
3556		op = F_UNRESVSP;
3557		msg = "f_unresvsp";
3558		break;
3559#ifdef F_FSYNC
3560	case DFFSYNC:
3561		op = F_FSYNC;
3562		msg = "f_fsync";
3563		break;
3564#endif
3565	}
3566
3567	rval = fcntl(fd, op, &flk);
3568
3569	if (rval == -1) {
3570		doio_fprintf(stderr,
3571			     "fcntl %s request failed: %s (%d)\n\tfcntl(%d, %s %d, {%d %lld ==> %lld}\n",
3572			     msg, SYSERR, errno,
3573			     fd, msg, op, flk.l_whence,
3574			     (long long)flk.l_start, (long long)flk.l_len);
3575
3576		doio_upanic(U_RVAL);
3577		rval = -1;
3578	}
3579
3580	/*
3581	 * Unlock file region if necessary
3582	 */
3583
3584	if (got_lock) {
3585		if (lock_file_region(file, fd, F_UNLCK,
3586				     min_byte, (max_byte - min_byte + 1)) < 0) {
3587			alloc_mem(-1);
3588			exit(E_INTERNAL);
3589		}
3590	}
3591
3592	return (rval == -1) ? -1 : 0;
3593}
3594#endif /* sgi */
3595
3596/*
3597 *  fsync(2) and fdatasync(2)
3598 */
3599#ifndef CRAY
3600int do_sync(struct io_req *req)
3601{
3602	int fd, oflags;
3603	int rval;
3604	char *file;
3605
3606	/*
3607	 * Initialize common fields - assumes r_oflags, r_file, r_offset, and
3608	 * r_nbytes are at the same offset in the read_req and reada_req
3609	 * structures.
3610	 */
3611	file = req->r_data.io.r_file;
3612	oflags = req->r_data.io.r_oflags;
3613
3614	/*
3615	 * Get an open file descriptor
3616	 */
3617
3618	if ((fd = alloc_fd(file, oflags)) == -1)
3619		return -1;
3620
3621	rval = 0;
3622	switch (req->r_type) {
3623	case FSYNC2:
3624		rval = fsync(fd);
3625		break;
3626	case FDATASYNC:
3627		rval = fdatasync(fd);
3628		break;
3629	default:
3630		rval = -1;
3631	}
3632	return (rval == -1) ? -1 : 0;
3633}
3634#endif /* !CRAY */
3635
/*
 * Fill mem_needed bytes at addr with the repeating Pattern by delegating
 * to pattern_fill().  Returns whatever pattern_fill() returns.
 */
int
doio_pat_fill(char *addr, int mem_needed, char *Pattern, int Pattern_Length,
	      int shift)
{
	int fill_status;

	/*
	 * NOTE: the shift argument is not forwarded; the fill always uses a
	 * pattern shift of 0.
	 */
	fill_status = pattern_fill(addr, mem_needed, Pattern, Pattern_Length, 0);

	return fill_status;
}
3642
/*
 * Verify that buf holds length bytes of the repeating pattern, starting
 * patshift bytes into the pattern.
 *
 * Returns NULL when pattern_check() reports a match.  Otherwise returns a
 * pointer to a static buffer describing the first mismatching byte: its
 * file offset (offset + position in buf) and up to 32 expected and actual
 * bytes, with unprintable characters shown as '.'.  The buffer is
 * overwritten by the next call.
 */
char *doio_pat_check(char *buf, int offset, int length, char *pattern,
		     int pattern_length, int patshift)
{
	static char errbuf[4096];
	int nb, i, pattern_index;
	char *cp, *bufend, *ep;
	char actual[33], expected[33];

	if (pattern_check(buf, length, pattern, pattern_length, patshift) != 0) {
		ep = errbuf;
		ep +=
		    sprintf(ep,
			    "Corrupt regions follow - unprintable chars are represented as '.'\n");
		ep +=
		    sprintf(ep,
			    "-----------------------------------------------------------------\n");

		pattern_index = patshift % pattern_length;
		cp = buf;
		bufend = buf + length;

		/* Walk buffer and pattern in step until the first difference. */
		while (cp < bufend) {
			if (*cp != pattern[pattern_index]) {
				/* Show at most sizeof(expected) - 1 bytes. */
				nb = bufend - cp;
				if ((unsigned int)nb > sizeof(expected) - 1) {
					nb = sizeof(expected) - 1;
				}

				ep +=
				    sprintf(ep,
					    "corrupt bytes starting at file offset %d\n",
					    offset + (int)(cp - buf));

				/*
				 * Fill in the expected and actual patterns
				 */
				memset(expected, 0x00, sizeof(expected));
				memset(actual, 0x00, sizeof(actual));

				for (i = 0; i < nb; i++) {
					expected[i] =
					    pattern[(pattern_index +
						     i) % pattern_length];
					/*
					 * <ctype.h> functions require a value
					 * representable as unsigned char;
					 * plain char may be negative.
					 */
					if (!isprint((unsigned char)expected[i])) {
						expected[i] = '.';
					}

					actual[i] = cp[i];
					if (!isprint((unsigned char)actual[i])) {
						actual[i] = '.';
					}
				}

				ep +=
				    sprintf(ep,
					    "    1st %2d expected bytes:  %s\n",
					    nb, expected);
				ep +=
				    sprintf(ep,
					    "    1st %2d actual bytes:    %s\n",
					    nb, actual);
				fflush(stderr);
				return errbuf;
			} else {
				cp++;
				pattern_index++;

				if (pattern_index == pattern_length) {
					pattern_index = 0;
				}
			}
		}
		/* pattern_check() failed but no differing byte was found. */
		return errbuf;
	}

	return NULL;
}
3720
3721/*
3722 * Check the contents of a file beginning at offset, for length bytes.  It
3723 * is assumed that there is a string of pattern bytes in this area of the
3724 * file.  Use normal buffered reads to do the verification.
3725 *
3726 * If there is a data mismatch, write a detailed message into a static buffer
3727 * suitable for the caller to print.  Otherwise print NULL.
3728 *
3729 * The fsa flag is set to non-zero if the buffer should be read back through
3730 * the FSA (unicos/mk).  This implies the file will be opened
3731 * O_PARALLEL|O_RAW|O_WELLFORMED to do the validation.  We must do this because
3732 * FSA will not allow the file to be opened for buffered io if it was
3733 * previously opened for O_PARALLEL io.
3734 */
3735
3736char *check_file(char *file, int offset, int length, char *pattern,
3737		 int pattern_length, int patshift, int fsa)
3738{
3739	static char errbuf[4096];
3740	int fd, nb, flags;
3741	char *buf, *em, *ep;
3742#ifdef sgi
3743	struct fd_cache *fdc;
3744#endif
3745
3746	buf = Memptr;
3747
3748	if (V_opt) {
3749		flags = Validation_Flags | O_RDONLY;
3750	} else {
3751		flags = O_RDONLY;
3752		if (fsa) {
3753#ifdef CRAY
3754			flags |= O_PARALLEL | O_RAW | O_WELLFORMED;
3755#endif
3756		}
3757	}
3758
3759	if ((fd = alloc_fd(file, flags)) == -1) {
3760		sprintf(errbuf,
3761			"Could not open file %s with flags %#o (%s) for data comparison:  %s (%d)\n",
3762			file, flags, format_oflags(flags), SYSERR, errno);
3763		return errbuf;
3764	}
3765
3766	if (lseek(fd, offset, SEEK_SET) == -1) {
3767		sprintf(errbuf,
3768			"Could not lseek to offset %d in %s for verification:  %s (%d)\n",
3769			offset, file, SYSERR, errno);
3770		return errbuf;
3771	}
3772#ifdef sgi
3773	/* Irix: Guarantee a properly aligned address on Direct I/O */
3774	fdc = alloc_fdcache(file, flags);
3775	if ((flags & O_DIRECT) && ((long)buf % fdc->c_memalign != 0)) {
3776		buf += fdc->c_memalign - ((long)buf % fdc->c_memalign);
3777	}
3778#endif
3779
3780	if ((nb = read(fd, buf, length)) == -1) {
3781#ifdef sgi
3782		sprintf(errbuf,
3783			"Could not read %d bytes from %s for verification:  %s (%d)\n\tread(%d, 0x%lx, %d)\n\tbuf %% alignment(%d) = %ld\n",
3784			length, file, SYSERR, errno,
3785			fd, buf, length,
3786			fdc->c_memalign, (long)buf % fdc->c_memalign);
3787#else
3788		sprintf(errbuf,
3789			"Could not read %d bytes from %s for verification:  %s (%d)\n",
3790			length, file, SYSERR, errno);
3791
3792#endif
3793		return errbuf;
3794	}
3795
3796	if (nb != length) {
3797		sprintf(errbuf,
3798			"Read wrong # bytes from %s.  Expected %d, got %d\n",
3799			file, length, nb);
3800		return errbuf;
3801	}
3802
3803	if ((em =
3804	     (*Data_Check) (buf, offset, length, pattern, pattern_length,
3805			    patshift)) != NULL) {
3806		ep = errbuf;
3807		ep += sprintf(ep, "*** DATA COMPARISON ERROR ***\n");
3808		ep +=
3809		    sprintf(ep, "check_file(%s, %d, %d, %s, %d, %d) failed\n\n",
3810			    file, offset, length, pattern, pattern_length,
3811			    patshift);
3812		ep +=
3813		    sprintf(ep, "Comparison fd is %d, with open flags %#o\n",
3814			    fd, flags);
3815		strcpy(ep, em);
3816		return (errbuf);
3817	}
3818	return NULL;
3819}
3820
3821/*
3822 * Function to single-thread stdio output.
3823 */
3824
3825int doio_fprintf(FILE * stream, char *format, ...)
3826{
3827	static int pid = -1;
3828	char *date;
3829	int rval;
3830	struct flock flk;
3831	va_list arglist;
3832	struct timeval ts;
3833	gettimeofday(&ts, NULL);
3834	date = hms(ts.tv_sec);
3835
3836	if (pid == -1) {
3837		pid = getpid();
3838	}
3839
3840	flk.l_whence = flk.l_start = flk.l_len = 0;
3841	flk.l_type = F_WRLCK;
3842	fcntl(fileno(stream), F_SETLKW, &flk);
3843
3844	va_start(arglist, format);
3845	rval = fprintf(stream, "\n%s%s (%5d) %s\n", Prog, TagName, pid, date);
3846	rval += fprintf(stream, "---------------------\n");
3847	vfprintf(stream, format, arglist);
3848	va_end(arglist);
3849
3850	fflush(stream);
3851
3852	flk.l_type = F_UNLCK;
3853	fcntl(fileno(stream), F_SETLKW, &flk);
3854
3855	return rval;
3856}
3857
3858/*
3859 * Simple function for allocating core memory.  Uses Memsize and Memptr to
3860 * keep track of the current amount allocated.
3861 */
3862#ifndef CRAY
3863int alloc_mem(int nbytes)
3864{
3865	char *cp;
3866	void *addr;
3867	int me = 0, flags, key, shmid;
3868	static int mturn = 0;	/* which memory type to use */
3869	struct memalloc *M;
3870	char filename[255];
3871#ifdef __linux__
3872	struct shmid_ds shm_ds;
3873#endif
3874
3875#ifdef __linux__
3876	memset(&shm_ds, 0x00, sizeof(struct shmid_ds));
3877#endif
3878
3879	/* nbytes = -1 means "free all allocated memory" */
3880	if (nbytes == -1) {
3881
3882		for (me = 0; me < Nmemalloc; me++) {
3883			if (Memalloc[me].space == NULL)
3884				continue;
3885
3886			switch (Memalloc[me].memtype) {
3887			case MEM_DATA:
3888#ifdef sgi
3889				if (Memalloc[me].flags & MEMF_MPIN)
3890					munpin(Memalloc[me].space,
3891					       Memalloc[me].size);
3892#endif
3893				free(Memalloc[me].space);
3894				Memalloc[me].space = NULL;
3895				Memptr = NULL;
3896				Memsize = 0;
3897				break;
3898			case MEM_SHMEM:
3899#ifdef sgi
3900				if (Memalloc[me].flags & MEMF_MPIN)
3901					munpin(Memalloc[me].space,
3902					       Memalloc[me].size);
3903#endif
3904				shmdt(Memalloc[me].space);
3905				Memalloc[me].space = NULL;
3906#ifdef sgi
3907				shmctl(Memalloc[me].fd, IPC_RMID);
3908#else
3909				shmctl(Memalloc[me].fd, IPC_RMID, &shm_ds);
3910#endif
3911				break;
3912			case MEM_MMAP:
3913#ifdef sgi
3914				if (Memalloc[me].flags & MEMF_MPIN)
3915					munpin(Memalloc[me].space,
3916					       Memalloc[me].size);
3917#endif
3918				munmap(Memalloc[me].space, Memalloc[me].size);
3919				close(Memalloc[me].fd);
3920				if (Memalloc[me].flags & MEMF_FILE) {
3921					unlink(Memalloc[me].name);
3922				}
3923				Memalloc[me].space = NULL;
3924				break;
3925			default:
3926				doio_fprintf(stderr,
3927					     "alloc_mem: HELP! Unknown memory space type %d index %d\n",
3928					     Memalloc[me].memtype, me);
3929				break;
3930			}
3931		}
3932		return 0;
3933	}
3934
3935	/*
3936	 * Select a memory area (currently round-robbin)
3937	 */
3938
3939	if (mturn >= Nmemalloc)
3940		mturn = 0;
3941
3942	M = &Memalloc[mturn];
3943
3944	switch (M->memtype) {
3945	case MEM_DATA:
3946		if (nbytes > M->size) {
3947			if (M->space != NULL) {
3948#ifdef sgi
3949				if (M->flags & MEMF_MPIN)
3950					munpin(M->space, M->size);
3951#endif
3952				free(M->space);
3953			}
3954			M->space = NULL;
3955			M->size = 0;
3956		}
3957
3958		if (M->space == NULL) {
3959			if ((cp = malloc(nbytes)) == NULL) {
3960				doio_fprintf(stderr,
3961					     "malloc(%d) failed:  %s (%d)\n",
3962					     nbytes, SYSERR, errno);
3963				return -1;
3964			}
3965#ifdef sgi
3966			if (M->flags & MEMF_MPIN) {
3967				if (mpin(cp, nbytes) == -1) {
3968					doio_fprintf(stderr,
3969						     "mpin(0x%lx, %d) failed:  %s (%d)\n",
3970						     cp, nbytes, SYSERR, errno);
3971				}
3972			}
3973#endif
3974			M->space = (void *)cp;
3975			M->size = nbytes;
3976		}
3977		break;
3978
3979	case MEM_MMAP:
3980		if (nbytes > M->size) {
3981			if (M->space != NULL) {
3982#ifdef sgi
3983				if (M->flags & MEMF_MPIN)
3984					munpin(M->space, M->size);
3985#endif
3986				munmap(M->space, M->size);
3987				close(M->fd);
3988				if (M->flags & MEMF_FILE)
3989					unlink(M->name);
3990			}
3991			M->space = NULL;
3992			M->size = 0;
3993		}
3994
3995		if (M->space == NULL) {
3996			if (strchr(M->name, '%')) {
3997				sprintf(filename, M->name, getpid());
3998				M->name = strdup(filename);
3999			}
4000
4001			if ((M->fd =
4002			     open(M->name, O_CREAT | O_RDWR, 0666)) == -1) {
4003				doio_fprintf(stderr,
4004					     "alloc_mmap: error %d (%s) opening '%s'\n",
4005					     errno, SYSERR, M->name);
4006				return (-1);
4007			}
4008
4009			addr = NULL;
4010			flags = 0;
4011			M->size = nbytes * 4;
4012
4013			/* bias addr if MEMF_ADDR | MEMF_FIXADDR */
4014			/* >>> how to pick a memory address? */
4015
4016			/* bias flags on MEMF_PRIVATE etc */
4017			if (M->flags & MEMF_PRIVATE)
4018				flags |= MAP_PRIVATE;
4019#ifdef sgi
4020			if (M->flags & MEMF_LOCAL)
4021				flags |= MAP_LOCAL;
4022			if (M->flags & MEMF_AUTORESRV)
4023				flags |= MAP_AUTORESRV;
4024			if (M->flags & MEMF_AUTOGROW)
4025				flags |= MAP_AUTOGROW;
4026#endif
4027			if (M->flags & MEMF_SHARED)
4028				flags |= MAP_SHARED;
4029
4030/*printf("alloc_mem, about to mmap, fd=%d, name=(%s)\n", M->fd, M->name);*/
4031			if ((M->space = mmap(addr, M->size,
4032					     PROT_READ | PROT_WRITE,
4033					     flags, M->fd, 0))
4034			    == MAP_FAILED) {
4035				doio_fprintf(stderr,
4036					     "alloc_mem: mmap error. errno %d (%s)\n\tmmap(addr 0x%x, size %d, read|write 0x%x, mmap flags 0x%x [%#o], fd %d, 0)\n\tfile %s\n",
4037					     errno, SYSERR, addr, M->size,
4038					     PROT_READ | PROT_WRITE, flags,
4039					     M->flags, M->fd, M->name);
4040				doio_fprintf(stderr, "\t%s%s%s%s%s",
4041					     (flags & MAP_PRIVATE) ? "private "
4042					     : "",
4043#ifdef sgi
4044					     (flags & MAP_LOCAL) ? "local " :
4045					     "",
4046					     (flags & MAP_AUTORESRV) ?
4047					     "autoresrv " : "",
4048					     (flags & MAP_AUTOGROW) ?
4049					     "autogrow " : "",
4050#endif
4051					     (flags & MAP_SHARED) ? "shared" :
4052					     "");
4053				return (-1);
4054			}
4055		}
4056		break;
4057
4058	case MEM_SHMEM:
4059		if (nbytes > M->size) {
4060			if (M->space != NULL) {
4061#ifdef sgi
4062				if (M->flags & MEMF_MPIN)
4063					munpin(M->space, M->size);
4064#endif
4065				shmdt(M->space);
4066#ifdef sgi
4067				shmctl(M->fd, IPC_RMID);
4068#else
4069				shmctl(M->fd, IPC_RMID, &shm_ds);
4070#endif
4071			}
4072			M->space = NULL;
4073			M->size = 0;
4074		}
4075
4076		if (M->space == NULL) {
4077			if (!strcmp(M->name, "private")) {
4078				key = IPC_PRIVATE;
4079			} else {
4080				sscanf(M->name, "%i", &key);
4081			}
4082
4083			M->size = M->nblks ? M->nblks * 512 : nbytes;
4084
4085			if (nbytes > M->size) {
4086#ifdef DEBUG
4087				doio_fprintf(stderr,
4088					     "MEM_SHMEM: nblks(%d) too small:  nbytes=%d  Msize=%d, skipping this req.\n",
4089					     M->nblks, nbytes, M->size);
4090#endif
4091				return SKIP_REQ;
4092			}
4093
4094			shmid = shmget(key, M->size, IPC_CREAT | 0666);
4095			if (shmid == -1) {
4096				doio_fprintf(stderr,
4097					     "shmget(0x%x, %d, CREAT) failed: %s (%d)\n",
4098					     key, M->size, SYSERR, errno);
4099				return (-1);
4100			}
4101			M->fd = shmid;
4102			M->space = shmat(shmid, NULL, SHM_RND);
4103			if (M->space == (void *)-1) {
4104				doio_fprintf(stderr,
4105					     "shmat(0x%x, NULL, SHM_RND) failed: %s (%d)\n",
4106					     shmid, SYSERR, errno);
4107				return (-1);
4108			}
4109#ifdef sgi
4110			if (M->flags & MEMF_MPIN) {
4111				if (mpin(M->space, M->size) == -1) {
4112					doio_fprintf(stderr,
4113						     "mpin(0x%lx, %d) failed:  %s (%d)\n",
4114						     M->space, M->size, SYSERR,
4115						     errno);
4116				}
4117			}
4118#endif
4119		}
4120		break;
4121
4122	default:
4123		doio_fprintf(stderr,
4124			     "alloc_mem: HELP! Unknown memory space type %d index %d\n",
4125			     Memalloc[me].memtype, mturn);
4126		break;
4127	}
4128
4129	Memptr = M->space;
4130	Memsize = M->size;
4131
4132	mturn++;
4133	return 0;
4134}
4135#else /* CRAY */
4136int alloc_mem(int nbytes)
4137{
4138	char *cp;
4139	int ip;
4140	static char *malloc_space;
4141
4142	/*
4143	 * The "unicos" version of this did some stuff with sbrk;
4144	 * this caused problems with async I/O on irix, and now appears
4145	 * to be causing problems with FSA I/O on unicos/mk.
4146	 */
4147#ifdef NOTDEF
4148	if (nbytes > Memsize) {
4149		if ((cp = (char *)sbrk(nbytes - Memsize)) == (char *)-1) {
4150			doio_fprintf(stderr, "sbrk(%d) failed:  %s (%d)\n",
4151				     nbytes - Memsize, SYSERR, errno);
4152			return -1;
4153		}
4154
4155		if (Memsize == 0)
4156			Memptr = cp;
4157		Memsize += nbytes - Memsize;
4158	}
4159#else
4160
4161	/* nbytes = -1 means "free all allocated memory" */
4162	if (nbytes == -1) {
4163		free(malloc_space);
4164		Memptr = NULL;
4165		Memsize = 0;
4166		return 0;
4167	}
4168
4169	if (nbytes > Memsize) {
4170		if (Memsize != 0)
4171			free(malloc_space);
4172
4173		if ((cp = malloc_space = malloc(nbytes)) == NULL) {
4174			doio_fprintf(stderr, "malloc(%d) failed:  %s (%d)\n",
4175				     nbytes, SYSERR, errno);
4176			return -1;
4177		}
4178#ifdef _CRAYT3E
4179		/* T3E requires memory to be aligned on 0x40 word boundaries */
4180		ip = (int)cp;
4181		if (ip & 0x3F != 0) {
4182			doio_fprintf(stderr,
4183				     "malloc(%d) = 0x%x(0x%x) not aligned by 0x%x\n",
4184				     nbytes, cp, ip, ip & 0x3f);
4185
4186			free(cp);
4187			if ((cp = malloc_space = malloc(nbytes + 0x40)) == NULL) {
4188				doio_fprintf(stderr,
4189					     "malloc(%d) failed:  %s (%d)\n",
4190					     nbytes, SYSERR, errno);
4191				return -1;
4192			}
4193			ip = (int)cp;
4194			cp += (0x40 - (ip & 0x3F));
4195		}
4196#endif /* _CRAYT3E */
4197		Memptr = cp;
4198		Memsize = nbytes;
4199	}
4200#endif /* NOTDEF */
4201	return 0;
4202}
4203#endif /* CRAY */
4204
4205/*
4206 * Simple function for allocating sds space.  Uses Sdssize and Sdsptr to
4207 * keep track of location and size of currently allocated chunk.
4208 */
4209
#if defined(_CRAY1)

/*
 * Grow the sds area with ssbreak() when nbytes exceeds Sdssize.
 * Returns 0 on success (including when no growth is needed), -1 if
 * ssbreak() fails.
 */
int alloc_sds(int nbytes)
{
	int nblks;

	/* Already big enough: nothing to do. */
	if (nbytes <= Sdssize)
		return 0;

	nblks = ssbreak(btoc(nbytes - Sdssize));
	if (nblks == -1) {
		doio_fprintf(stderr, "ssbreak(%d) failed:  %s (%d)\n",
			     btoc(nbytes - Sdssize), SYSERR, errno);
		return -1;
	}

	Sdssize = ctob(nblks);
	Sdsptr = 0;

	return 0;
}

#elif defined(CRAY)

/*
 * This build has no sds support: reaching this function is reported as an
 * internal error and the process exits with E_INTERNAL.
 */
int alloc_sds(int nbytes)
{
	doio_fprintf(stderr,
		     "Internal Error - alloc_sds() called on a CRAY2 system\n");
	alloc_mem(-1);
	exit(E_INTERNAL);
}

#endif /* _CRAY1 / CRAY */
4245
4246/*
4247 * Function to maintain a file descriptor cache, so that doio does not have
4248 * to do so many open() and close() calls.  Descriptors are stored in the
4249 * cache by file name, and open flags.  Each entry also has a _rtc value
4250 * associated with it which is used in aging.  If doio cannot open a file
4251 * because it already has too many open (ie. system limit hit) it will close
4252 * the one in the cache that has the oldest _rtc value.
4253 *
4254 * If alloc_fd() is called with a file of NULL, it will close all descriptors
4255 * in the cache, and free the memory in the cache.
4256 */
4257
4258int alloc_fd(char *file, int oflags)
4259{
4260	struct fd_cache *fdc;
4261	struct fd_cache *alloc_fdcache(char *file, int oflags);
4262
4263	fdc = alloc_fdcache(file, oflags);
4264	if (fdc != NULL)
4265		return (fdc->c_fd);
4266	else
4267		return (-1);
4268}
4269
4270struct fd_cache *alloc_fdcache(char *file, int oflags)
4271{
4272	int fd;
4273	struct fd_cache *free_slot, *oldest_slot, *cp;
4274	static int cache_size = 0;
4275	static struct fd_cache *cache = NULL;
4276#ifdef sgi
4277	struct dioattr finfo;
4278#endif
4279
4280	/*
4281	 * If file is NULL, it means to free up the fd cache.
4282	 */
4283
4284	if (file == NULL && cache != NULL) {
4285		for (cp = cache; cp < &cache[cache_size]; cp++) {
4286			if (cp->c_fd != -1) {
4287				close(cp->c_fd);
4288			}
4289#ifndef CRAY
4290			if (cp->c_memaddr != NULL) {
4291				munmap(cp->c_memaddr, cp->c_memlen);
4292			}
4293#endif
4294		}
4295
4296		free(cache);
4297		cache = NULL;
4298		cache_size = 0;
4299		return 0;
4300	}
4301
4302	free_slot = NULL;
4303	oldest_slot = NULL;
4304
4305	/*
4306	 * Look for a fd in the cache.  If one is found, return it directly.
4307	 * Otherwise, when this loop exits, oldest_slot will point to the
4308	 * oldest fd slot in the cache, and free_slot will point to an
4309	 * unoccupied slot if there are any.
4310	 */
4311
4312	for (cp = cache; cp != NULL && cp < &cache[cache_size]; cp++) {
4313		if (cp->c_fd != -1 &&
4314		    cp->c_oflags == oflags && strcmp(cp->c_file, file) == 0) {
4315#ifdef CRAY
4316			cp->c_rtc = _rtc();
4317#else
4318			cp->c_rtc = Reqno;
4319#endif
4320			return cp;
4321		}
4322
4323		if (cp->c_fd == -1) {
4324			if (free_slot == NULL) {
4325				free_slot = cp;
4326			}
4327		} else {
4328			if (oldest_slot == NULL ||
4329			    cp->c_rtc < oldest_slot->c_rtc) {
4330				oldest_slot = cp;
4331			}
4332		}
4333	}
4334
4335	/*
4336	 * No matching file/oflags pair was found in the cache.  Attempt to
4337	 * open a new fd.
4338	 */
4339
4340	if ((fd = open(file, oflags, 0666)) < 0) {
4341		if (errno != EMFILE) {
4342			doio_fprintf(stderr,
4343				     "Could not open file %s with flags %#o (%s): %s (%d)\n",
4344				     file, oflags, format_oflags(oflags),
4345				     SYSERR, errno);
4346			alloc_mem(-1);
4347			exit(E_SETUP);
4348		}
4349
4350		/*
4351		 * If we get here, we have as many open fd's as we can have.
4352		 * Close the oldest one in the cache (pointed to by
4353		 * oldest_slot), and attempt to re-open.
4354		 */
4355
4356		close(oldest_slot->c_fd);
4357		oldest_slot->c_fd = -1;
4358		free_slot = oldest_slot;
4359
4360		if ((fd = open(file, oflags, 0666)) < 0) {
4361			doio_fprintf(stderr,
4362				     "Could not open file %s with flags %#o (%s):  %s (%d)\n",
4363				     file, oflags, format_oflags(oflags),
4364				     SYSERR, errno);
4365			alloc_mem(-1);
4366			exit(E_SETUP);
4367		}
4368	}
4369
4370/*printf("alloc_fd: new file %s flags %#o fd %d\n", file, oflags, fd);*/
4371
4372	/*
4373	 * If we get here, fd is our open descriptor.  If free_slot is NULL,
4374	 * we need to grow the cache, otherwise free_slot is the slot that
4375	 * should hold the fd info.
4376	 */
4377
4378	if (free_slot == NULL) {
4379		cache =
4380		    (struct fd_cache *)realloc(cache,
4381					       sizeof(struct fd_cache) *
4382					       (FD_ALLOC_INCR + cache_size));
4383		if (cache == NULL) {
4384			doio_fprintf(stderr,
4385				     "Could not malloc() space for fd chace");
4386			alloc_mem(-1);
4387			exit(E_SETUP);
4388		}
4389
4390		cache_size += FD_ALLOC_INCR;
4391
4392		for (cp = &cache[cache_size - FD_ALLOC_INCR];
4393		     cp < &cache[cache_size]; cp++) {
4394			cp->c_fd = -1;
4395		}
4396
4397		free_slot = &cache[cache_size - FD_ALLOC_INCR];
4398	}
4399
4400	/*
4401	 * finally, fill in the cache slot info
4402	 */
4403
4404	free_slot->c_fd = fd;
4405	free_slot->c_oflags = oflags;
4406	strcpy(free_slot->c_file, file);
4407#ifdef CRAY
4408	free_slot->c_rtc = _rtc();
4409#else
4410	free_slot->c_rtc = Reqno;
4411#endif
4412
4413#ifdef sgi
4414	if (oflags & O_DIRECT) {
4415		if (fcntl(fd, F_DIOINFO, &finfo) == -1) {
4416			finfo.d_mem = 1;
4417			finfo.d_miniosz = 1;
4418			finfo.d_maxiosz = 1;
4419		}
4420	} else {
4421		finfo.d_mem = 1;
4422		finfo.d_miniosz = 1;
4423		finfo.d_maxiosz = 1;
4424	}
4425
4426	free_slot->c_memalign = finfo.d_mem;
4427	free_slot->c_miniosz = finfo.d_miniosz;
4428	free_slot->c_maxiosz = finfo.d_maxiosz;
4429#endif /* sgi */
4430#ifndef CRAY
4431	free_slot->c_memaddr = NULL;
4432	free_slot->c_memlen = 0;
4433#endif
4434
4435	return free_slot;
4436}
4437
4438/*
4439 *
4440 *			Signal Handling Section
4441 *
4442 *
4443 */
4444
4445#ifdef sgi
4446/*
4447 * "caller-id" for signals
4448 */
4449void signal_info(int sig, siginfo_t * info, void *v)
4450{
4451	int haveit = 0;
4452
4453	if (info != NULL) {
4454		switch (info->si_code) {
4455		case SI_USER:
4456			doio_fprintf(stderr,
4457				     "signal_info: si_signo %d si_errno %d si_code SI_USER pid %d uid %d\n",
4458				     info->si_signo, info->si_errno,
4459				     info->si_pid, info->si_uid);
4460			haveit = 1;
4461			break;
4462
4463		case SI_QUEUE:
4464			doio_fprintf(stderr,
4465				     "signal_info  si_signo %d si_code = SI_QUEUE\n",
4466				     info->si_signo);
4467			haveit = 1;
4468			break;
4469		}
4470
4471		if (!haveit) {
4472			if ((info->si_signo == SIGSEGV) ||
4473			    (info->si_signo == SIGBUS)) {
4474				doio_fprintf(stderr,
4475					     "signal_info  si_signo %d si_errno %d si_code = %d  si_addr=%p  active_mmap_rw=%d havesigint=%d\n",
4476					     info->si_signo, info->si_errno,
4477					     info->si_code, info->si_addr,
4478					     active_mmap_rw, havesigint);
4479				haveit = 1;
4480			}
4481		}
4482
4483		if (!haveit) {
4484			doio_fprintf(stderr,
4485				     "signal_info: si_signo %d si_errno %d unknown code %d\n",
4486				     info->si_signo, info->si_errno,
4487				     info->si_code);
4488		}
4489	} else {
4490		doio_fprintf(stderr, "signal_info: sig %d\n", sig);
4491	}
4492}
4493
4494void cleanup_handler(int sig, siginfo_t * info, void *v)
4495{
4496	havesigint = 1;		/* in case there's a followup signal */
4497	/*signal_info(sig, info, v); *//* be quiet on "normal" kill */
4498	alloc_mem(-1);
4499	exit(0);
4500}
4501
4502void die_handler(int sig, siginfo_t * info, void *v)
4503{
4504	doio_fprintf(stderr, "terminating on signal %d\n", sig);
4505	signal_info(sig, info, v);
4506	alloc_mem(-1);
4507	exit(1);
4508}
4509
4510void sigbus_handler(int sig, siginfo_t * info, void *v)
4511{
4512	/* While we are doing a memcpy to/from an mmapped region we can
4513	   get a SIGBUS for a variety of reasons--and not all of them
4514	   should be considered failures.
4515
4516	   Under normal conditions if we get a SIGINT it means we've been
4517	   told to shutdown.  However, if we're currently doing the above-
4518	   mentioned memcopy then the kernel will follow that SIGINT with
4519	   a SIGBUS.  We can guess that we're in this situation by seeing
4520	   that the si_errno field in the siginfo structure has EINTR as
4521	   an errno.  (We might make the guess stronger by looking at the
4522	   si_addr field to see that it's not faulting off the end of the
4523	   mmapped region, but it seems that in such a case havesigint
4524	   would not have been set so maybe that doesn't make the guess
4525	   stronger.)
4526	 */
4527
4528	if (active_mmap_rw && havesigint && (info->si_errno == EINTR)) {
4529		cleanup_handler(sig, info, v);
4530	} else {
4531		die_handler(sig, info, v);
4532	}
4533}
4534#else
4535
4536void cleanup_handler(int sig)
4537{
4538	havesigint = 1;		/* in case there's a followup signal */
4539	alloc_mem(-1);
4540	exit(0);
4541}
4542
/*
 * Fatal-signal handler: report the signal, free all allocated memory and
 * exit with status 1.
 */
void die_handler(int sig)
{
	doio_fprintf(stderr, "terminating on signal %d\n", sig);

	alloc_mem(-1);

	exit(1);
}
4549
4550#ifndef CRAY
4551void sigbus_handler(int sig)
4552{
4553	/* See sigbus_handler() in the 'ifdef sgi' case for details.  Here,
4554	   we don't have the siginfo stuff so the guess is weaker but we'll
4555	   do it anyway.
4556	 */
4557
4558	if (active_mmap_rw && havesigint)
4559		cleanup_handler(sig);
4560	else
4561		die_handler(sig);
4562}
4563#endif /* !CRAY */
4564#endif /* sgi */
4565
/* Signal handler that deliberately does nothing. */
void noop_handler(int sig)
{
	(void)sig;
}
4570
4571/*
4572 * SIGINT handler for the parent (original doio) process.  It simply sends
4573 * a SIGINT to all of the doio children.  Since they're all in the same
4574 * pgrp, this can be done with a single kill().
4575 */
4576
4577void sigint_handler(int sig)
4578{
4579	int i;
4580
4581	for (i = 0; i < Nchildren; i++) {
4582		if (Children[i] != -1) {
4583			kill(Children[i], SIGINT);
4584		}
4585	}
4586}
4587
4588/*
4589 * Signal handler used to inform a process when async io completes.  Referenced
4590 * in do_read() and do_write().  Note that the signal handler is not
4591 * re-registered.
4592 */
4593
4594void aio_handler(int sig)
4595{
4596	unsigned int i;
4597	struct aio_info *aiop;
4598
4599	for (i = 0; i < sizeof(Aio_Info) / sizeof(Aio_Info[0]); i++) {
4600		aiop = &Aio_Info[i];
4601
4602		if (aiop->strategy == A_SIGNAL && aiop->sig == sig) {
4603			aiop->signalled++;
4604
4605			if (aio_done(aiop)) {
4606				aiop->done++;
4607			}
4608		}
4609	}
4610}
4611
4612/*
4613 * dump info on all open aio slots
4614 */
4615void dump_aio(void)
4616{
4617	unsigned int i, count;
4618
4619	count = 0;
4620	for (i = 0; i < sizeof(Aio_Info) / sizeof(Aio_Info[0]); i++) {
4621		if (Aio_Info[i].busy) {
4622			count++;
4623			fprintf(stderr,
4624				"Aio_Info[%03d] id=%d fd=%d signal=%d signaled=%d\n",
4625				i, Aio_Info[i].id,
4626				Aio_Info[i].fd,
4627				Aio_Info[i].sig, Aio_Info[i].signalled);
4628			fprintf(stderr, "\tstrategy=%s\n",
4629				format_strat(Aio_Info[i].strategy));
4630		}
4631	}
4632	fprintf(stderr, "%d active async i/os\n", count);
4633}
4634
4635#ifdef sgi
4636/*
4637 * Signal handler called as a callback, not as a signal.
4638 * 'val' is the value from sigev_value and is assumed to be the
4639 * Aio_Info[] index.
4640 */
4641void cb_handler(sigval_t val)
4642{
4643	struct aio_info *aiop;
4644
4645/*printf("cb_handler requesting slot %d\n", val.sival_int);*/
4646	aiop = aio_slot(val.sival_int);
4647/*printf("cb_handler, aiop=%p\n", aiop);*/
4648
4649/*printf("%d in cb_handler\n", getpid() );*/
4650	if (aiop->strategy == A_CALLBACK) {
4651		aiop->signalled++;
4652
4653		if (aio_done(aiop)) {
4654			aiop->done++;
4655		}
4656	}
4657}
4658#endif
4659
4660struct aio_info *aio_slot(int aio_id)
4661{
4662	unsigned int i;
4663	static int id = 1;
4664	struct aio_info *aiop;
4665
4666	aiop = NULL;
4667
4668	for (i = 0; i < sizeof(Aio_Info) / sizeof(Aio_Info[0]); i++) {
4669		if (aio_id == -1) {
4670			if (!Aio_Info[i].busy) {
4671				aiop = &Aio_Info[i];
4672				aiop->busy = 1;
4673				aiop->id = id++;
4674				break;
4675			}
4676		} else {
4677			if (Aio_Info[i].busy && Aio_Info[i].id == aio_id) {
4678				aiop = &Aio_Info[i];
4679				break;
4680			}
4681		}
4682	}
4683
4684	if (aiop == NULL) {
4685		doio_fprintf(stderr, "aio_slot(%d) not found.  Request %d\n",
4686			     aio_id, Reqno);
4687		dump_aio();
4688		alloc_mem(-1);
4689		exit(E_INTERNAL);
4690	}
4691
4692	return aiop;
4693}
4694
4695int aio_register(int fd, int strategy, int sig)
4696{
4697	struct aio_info *aiop;
4698	struct sigaction sa;
4699
4700	aiop = aio_slot(-1);
4701
4702	aiop->fd = fd;
4703	aiop->strategy = strategy;
4704	aiop->done = 0;
4705#ifdef CRAY
4706	memset((char *)&aiop->iosw, 0x00, sizeof(aiop->iosw));
4707#endif
4708
4709	if (strategy == A_SIGNAL) {
4710		aiop->sig = sig;
4711		aiop->signalled = 0;
4712
4713		sa.sa_handler = aio_handler;
4714		sa.sa_flags = 0;
4715		sigemptyset(&sa.sa_mask);
4716
4717		sigaction(sig, &sa, &aiop->osa);
4718	} else {
4719		aiop->sig = -1;
4720		aiop->signalled = 0;
4721	}
4722
4723	return aiop->id;
4724}
4725
4726int aio_unregister(int aio_id)
4727{
4728	struct aio_info *aiop;
4729
4730	aiop = aio_slot(aio_id);
4731
4732	if (aiop->strategy == A_SIGNAL) {
4733		sigaction(aiop->sig, &aiop->osa, NULL);
4734	}
4735
4736	aiop->busy = 0;
4737	return 0;
4738}
4739
#ifndef __linux__
/*
 * Wait for the async request registered under aio_id to complete, using
 * the strategy that was stored in its slot by aio_register().
 *
 * Always returns 0.  A failure of one of the underlying wait primitives
 * (recall, recalla, recalls, aio_suspend) is fatal: a message is printed
 * and the process exits with E_SETUP.  A strategy value that has no case
 * below falls out of the switch without waiting.
 */
int aio_wait(int aio_id)
{
#ifdef RECALL_SIZEOF
	long mask[RECALL_SIZEOF];
#endif
	sigset_t sigset;
	struct aio_info *aiop;
#ifdef CRAY
	struct iosw *ioswlist[1];
#endif
#ifdef sgi
	const aiocb_t *aioary[1];
#endif
	int r, cnt;

	/* aio_slot() exits the program if aio_id is not a busy slot */
	aiop = aio_slot(aio_id);
/*printf("%d aiop B =%p\n", getpid(), aiop);*/

	switch (aiop->strategy) {
	case A_POLL:
		/* Busy-wait: spin until aio_done() reports non-zero */
		while (!aio_done(aiop)) ;
		break;

	case A_SIGNAL:
		/*
		 * Hold the completion signal, then repeatedly sigsuspend()
		 * with an empty mask until both the signalled and done flags
		 * of the slot are set.  Neither flag is written in this
		 * function - presumably aio_handler sets them; verify there.
		 * The signal is re-held after every wakeup and is still held
		 * when this case is left.
		 */
		sigemptyset(&sigset);
		sighold(aiop->sig);

		while (!aiop->signalled || !aiop->done) {
			sigsuspend(&sigset);
			sighold(aiop->sig);
		}
		break;

#ifdef CRAY
	case A_RECALL:
		/* Wait on this request's status word via recall() on the fd */
		ioswlist[0] = &aiop->iosw;
		if (recall(aiop->fd, 1, ioswlist) < 0) {
			doio_fprintf(stderr, "recall() failed:  %s (%d)\n",
				     SYSERR, errno);
			exit(E_SETUP);
		}
		break;

#ifdef RECALL_SIZEOF

	case A_RECALLA:
		/* Wait via recalla() with a mask holding only this fd */
		RECALL_INIT(mask);
		RECALL_SET(mask, aiop->fd);
		if (recalla(mask) < 0) {
			doio_fprintf(stderr, "recalla() failed:  %s (%d)\n",
				     SYSERR, errno);
			exit(E_SETUP);
		}

		RECALL_CLR(mask, aiop->fd);
		break;
#endif

	case A_RECALLS:
		/* Wait via recalls() on this request's status word */
		ioswlist[0] = &aiop->iosw;
		if (recalls(1, ioswlist) < 0) {
			doio_fprintf(stderr, "recalls failed:  %s (%d)\n",
				     SYSERR, errno);
			exit(E_SETUP);
		}
		break;
#endif /* CRAY */

#ifdef sgi
	case A_CALLBACK:
		/*
		 * Keep calling aio_suspend() until the slot's done flag is
		 * set.  cnt counts the attempts; it is only reported by the
		 * disabled diagnostic below.
		 */
		aioary[0] = &aiop->aiocb;
		cnt = 0;
		do {
			r = aio_suspend(aioary, 1, NULL);
			if (r == -1) {
				doio_fprintf(stderr,
					     "aio_suspend failed: %s (%d)\n",
					     SYSERR, errno);
				exit(E_SETUP);
			}
			cnt++;
		} while (aiop->done == 0);

#if 0
		/*
		 * after having this set for a while, I've decided that
		 * it's too noisy
		 */
		if (cnt > 1)
			doio_fprintf(stderr,
				     "aio_wait: callback wait took %d tries\n",
				     cnt);
#endif

		/*
		 * Note: cb_handler already calls aio_done
		 */
		break;

	case A_SUSPEND:
		/* Single aio_suspend(), then collect status with aio_done() */
		aioary[0] = &aiop->aiocb;
		r = aio_suspend(aioary, 1, NULL);
		if (r == -1) {
			doio_fprintf(stderr, "aio_suspend failed: %s (%d)\n",
				     SYSERR, errno);
			exit(E_SETUP);
		}

		aio_done(aiop);
		break;
#endif
	}

/*printf("aio_wait: errno %d return %d\n", aiop->aio_errno, aiop->aio_ret);*/

	return 0;
}
#endif /* !linux */
4859
/*
 * Format specified time into HH:MM:SS format.  t is the time to format
 * in seconds (as returned from time(2)).
 *
 * Returns a pointer to a static buffer which is overwritten by every call.
 * If t cannot be converted to local time, or cannot be formatted, the
 * buffer holds the placeholder "??:??:??" instead.
 */

char *hms(time_t t)
{
	static char ascii_time[9];
	struct tm *ltime;

	/*
	 * localtime() returns NULL when t is not representable as a
	 * broken-down time; handing that to strftime() would be undefined.
	 * strftime() returns 0 when the result does not fit, leaving the
	 * buffer contents indeterminate.
	 */
	ltime = localtime(&t);
	if (ltime == NULL ||
	    strftime(ascii_time, sizeof(ascii_time), "%H:%M:%S", ltime) == 0) {
		strcpy(ascii_time, "??:??:??");
	}

	return ascii_time;
}
4875
/*
 * Report whether an async io request has completed.
 *
 *	CRAY:	returns the sw_flag field of the request's iosw.
 *	sgi:	stores aio_error() in ainfo->aio_errno; once that is no
 *		longer EINPROGRESS, stores aio_return() in ainfo->aio_ret.
 *		Returns 1 when finished, 0 while still in progress.  A -1
 *		from either call is fatal (message, exit(E_SETUP)).
 *	other:	returns -1 (invalid).
 */

int aio_done(struct aio_info *ainfo)
{
#if defined(CRAY)
	return ainfo->iosw.sw_flag;
#elif defined(sgi)
	ainfo->aio_errno = aio_error(&ainfo->aiocb);
	if (ainfo->aio_errno == -1) {
		doio_fprintf(stderr, "aio_done: aio_error failed: %s (%d)\n",
			     SYSERR, errno);
		exit(E_SETUP);
	}

	/* Still running: nothing to collect yet */
	if (ainfo->aio_errno == EINPROGRESS)
		return 0;

	ainfo->aio_ret = aio_return(&ainfo->aiocb);
	if (ainfo->aio_ret == -1) {
		doio_fprintf(stderr,
			     "aio_done: aio_return failed: %s (%d)\n",
			     SYSERR, errno);
		exit(E_SETUP);
	}

	return 1;
#else
	return -1;		/* invalid */
#endif
}
4907
4908/*
4909 * Routine to handle upanic() - it first attempts to set the panic flag.  If
4910 * the flag cannot be set, an error message is issued.  A call to upanic
4911 * with PA_PANIC is then done unconditionally, in case the panic flag was set
4912 * from outside the program (as with the panic(8) program).
4913 *
4914 * Note - we only execute the upanic code if -U was used, and the passed in
4915 * mask is set in the Upanic_Conditions bitmask.
4916 */
4917
4918void doio_upanic(int mask)
4919{
4920	if (U_opt == 0 || (mask & Upanic_Conditions) == 0) {
4921		return;
4922	}
4923#ifdef CRAY
4924	if (upanic(PA_SET) < 0) {
4925		doio_fprintf(stderr,
4926			     "WARNING - Could not set the panic flag - upanic(PA_SET) failed:  %s (%d)\n",
4927			     SYSERR, errno);
4928	}
4929
4930	upanic(PA_PANIC);
4931#endif
4932#ifdef sgi
4933	syssgi(1005);		/* syssgi test panic - DEBUG kernels only */
4934#endif
4935	doio_fprintf(stderr, "WARNING - upanic() failed\n");
4936}
4937
4938/*
4939 * Parse cmdline options/arguments and set appropriate global variables.
4940 * If the cmdline is valid, return 0 to caller.  Otherwise exit with a status
4941 * of 1.
4942 */
4943
4944int parse_cmdline(int argc, char **argv, char *opts)
4945{
4946	int c;
4947	char cc, *cp = NULL, *tok = NULL;
4948	extern int opterr;
4949	extern int optind;
4950	extern char *optarg;
4951	struct smap *s;
4952	char *memargs[NMEMALLOC];
4953	int nmemargs, ma;
4954
4955	if (*argv[0] == '-') {
4956		argv[0]++;
4957		Execd = 1;
4958	}
4959
4960	if ((Prog = strrchr(argv[0], '/')) == NULL) {
4961		Prog = argv[0];
4962	} else {
4963		Prog++;
4964	}
4965
4966	opterr = 0;
4967	while ((c = getopt(argc, argv, opts)) != EOF) {
4968		switch ((char)c) {
4969		case 'a':
4970			a_opt++;
4971			break;
4972
4973		case 'C':
4974			C_opt++;
4975			for (s = checkmap; s->string != NULL; s++)
4976				if (!strcmp(s->string, optarg))
4977					break;
4978			if (s->string == NULL && tok != NULL) {
4979				fprintf(stderr,
4980					"%s%s:  Illegal -C arg (%s).  Must be one of: ",
4981					Prog, TagName, tok);
4982
4983				for (s = checkmap; s->string != NULL; s++)
4984					fprintf(stderr, "%s ", s->string);
4985				fprintf(stderr, "\n");
4986				exit(1);
4987			}
4988
4989			switch (s->value) {
4990			case C_DEFAULT:
4991				Data_Fill = doio_pat_fill;
4992				Data_Check = doio_pat_check;
4993				break;
4994			default:
4995				fprintf(stderr,
4996					"%s%s:  Unrecognised -C arg '%s' %d",
4997					Prog, TagName, s->string, s->value);
4998				exit(1);
4999			}
5000			break;
5001
5002		case 'd':	/* delay between i/o ops */
5003			parse_delay(optarg);
5004			break;
5005
5006		case 'e':
5007			if (Npes > 1 && Nprocs > 1) {
5008				fprintf(stderr,
5009					"%s%s:  Warning - Program is a multi-pe application - exec option is ignored.\n",
5010					Prog, TagName);
5011			}
5012			e_opt++;
5013			break;
5014
5015		case 'h':
5016			help(stdout);
5017			exit(0);
5018			break;
5019
5020		case 'k':
5021			k_opt++;
5022			break;
5023
5024		case 'm':
5025			Message_Interval = strtol(optarg, &cp, 10);
5026			if (*cp != '\0' || Message_Interval < 0) {
5027				fprintf(stderr,
5028					"%s%s:  Illegal -m arg (%s):  Must be an integer >= 0\n",
5029					Prog, TagName, optarg);
5030				exit(1);
5031			}
5032			m_opt++;
5033			break;
5034
5035		case 'M':	/* memory allocation types */
5036#ifndef CRAY
5037			nmemargs = string_to_tokens(optarg, memargs, 32, ",");
5038			for (ma = 0; ma < nmemargs; ma++) {
5039				parse_memalloc(memargs[ma]);
5040			}
5041			/*dump_memalloc(); */
5042#else
5043			fprintf(stderr,
5044				"%s%s: Error: -M isn't supported on this platform\n",
5045				Prog, TagName);
5046			exit(1);
5047#endif
5048			M_opt++;
5049			break;
5050
5051		case 'N':
5052			sprintf(TagName, "(%.39s)", optarg);
5053			break;
5054
5055		case 'n':
5056			Nprocs = strtol(optarg, &cp, 10);
5057			if (*cp != '\0' || Nprocs < 1) {
5058				fprintf(stderr,
5059					"%s%s:  Illegal -n arg (%s):  Must be integer > 0\n",
5060					Prog, TagName, optarg);
5061				exit(E_USAGE);
5062			}
5063
5064			if (Npes > 1 && Nprocs > 1) {
5065				fprintf(stderr,
5066					"%s%s:  Program has been built as a multi-pe app.  -n1 is the only nprocs value allowed\n",
5067					Prog, TagName);
5068				exit(E_SETUP);
5069			}
5070			n_opt++;
5071			break;
5072
5073		case 'r':
5074			Release_Interval = strtol(optarg, &cp, 10);
5075			if (*cp != '\0' || Release_Interval < 0) {
5076				fprintf(stderr,
5077					"%s%s:  Illegal -r arg (%s):  Must be integer >= 0\n",
5078					Prog, TagName, optarg);
5079				exit(E_USAGE);
5080			}
5081
5082			r_opt++;
5083			break;
5084
5085		case 'w':
5086			Write_Log = optarg;
5087			w_opt++;
5088			break;
5089
5090		case 'v':
5091			v_opt++;
5092			break;
5093
5094		case 'V':
5095			if (strcasecmp(optarg, "sync") == 0) {
5096				Validation_Flags = O_SYNC;
5097			} else if (strcasecmp(optarg, "buffered") == 0) {
5098				Validation_Flags = 0;
5099#ifdef CRAY
5100			} else if (strcasecmp(optarg, "parallel") == 0) {
5101				Validation_Flags = O_PARALLEL;
5102			} else if (strcasecmp(optarg, "ldraw") == 0) {
5103				Validation_Flags = O_LDRAW;
5104			} else if (strcasecmp(optarg, "raw") == 0) {
5105				Validation_Flags = O_RAW;
5106#endif
5107#ifdef sgi
5108			} else if (strcasecmp(optarg, "direct") == 0) {
5109				Validation_Flags = O_DIRECT;
5110#endif
5111			} else {
5112				if (sscanf
5113				    (optarg, "%i%c", &Validation_Flags,
5114				     &cc) != 1) {
5115					fprintf(stderr,
5116						"%s:  Invalid -V argument (%s) - must be a decimal, hex, or octal\n",
5117						Prog, optarg);
5118					fprintf(stderr,
5119						"    number, or one of the following strings:  'sync',\n");
5120					fprintf(stderr,
5121						"    'buffered', 'parallel', 'ldraw', or 'raw'\n");
5122					exit(E_USAGE);
5123				}
5124			}
5125			V_opt++;
5126			break;
5127		case 'U':
5128			tok = strtok(optarg, ",");
5129			while (tok != NULL) {
5130				for (s = Upanic_Args; s->string != NULL; s++)
5131					if (strcmp(s->string, tok) == 0)
5132						break;
5133
5134				if (s->string == NULL) {
5135					fprintf(stderr,
5136						"%s%s:  Illegal -U arg (%s).  Must be one of: ",
5137						Prog, TagName, tok);
5138
5139					for (s = Upanic_Args; s->string != NULL;
5140					     s++)
5141						fprintf(stderr, "%s ",
5142							s->string);
5143
5144					fprintf(stderr, "\n");
5145
5146					exit(1);
5147				}
5148
5149				Upanic_Conditions |= s->value;
5150				tok = strtok(NULL, ",");
5151			}
5152
5153			U_opt++;
5154			break;
5155
5156		case '?':
5157			usage(stderr);
5158			exit(E_USAGE);
5159			break;
5160		}
5161	}
5162
5163	/*
5164	 * Supply defaults
5165	 */
5166
5167	if (!C_opt) {
5168		Data_Fill = doio_pat_fill;
5169		Data_Check = doio_pat_check;
5170	}
5171
5172	if (!U_opt)
5173		Upanic_Conditions = 0;
5174
5175	if (!n_opt)
5176		Nprocs = 1;
5177
5178	if (!r_opt)
5179		Release_Interval = DEF_RELEASE_INTERVAL;
5180
5181	if (!M_opt) {
5182		Memalloc[Nmemalloc].memtype = MEM_DATA;
5183		Memalloc[Nmemalloc].flags = 0;
5184		Memalloc[Nmemalloc].name = NULL;
5185		Memalloc[Nmemalloc].space = NULL;
5186		Nmemalloc++;
5187	}
5188
5189	/*
5190	 * Initialize input stream
5191	 */
5192
5193	if (argc == optind) {
5194		Infile = NULL;
5195	} else {
5196		Infile = argv[optind++];
5197	}
5198
5199	if (argc != optind) {
5200		usage(stderr);
5201		exit(E_USAGE);
5202	}
5203
5204	return 0;
5205}
5206
5207/*
5208 * Parse memory allocation types
5209 *
5210 * Types are:
5211 *  Data
5212 *  T3E-shmem:blksize[:nblks]
5213 *  SysV-shmem:shmid:blksize:nblks
5214 *	if shmid is "private", use IPC_PRIVATE
5215 *	and nblks is not required
5216 *
5217 *  mmap:flags:filename:blksize[:nblks]
5218 *   flags are one of:
5219 *	p - private (MAP_PRIVATE)
5220 *	a - private, MAP_AUTORESRV
5221 *	l - local (MAP_LOCAL)
5222 *	s - shared (nblks required)
5223 *
 *   plus any of:
 *	f - fixed address (MAP_FIXED)
 *	A - use an address without MAP_FIXED
 *	G - autogrow (map once at startup)
 *	U - unlink file when done
 *
 *  mmap:flags:devzero
 *	mmap /dev/zero  (shared not allowed)
 *	maps the first 4096 bytes of /dev/zero
5232 *
5233 * - put a directory at the beginning of the shared
5234 *   regions saying what pid has what region.
5235 *	DIRMAGIC
5236 *	BLKSIZE
5237 *	NBLKS
5238 *	nblks worth of directories - 1 int pids
5239 */
5240#ifndef CRAY
5241void parse_memalloc(char *arg)
5242{
5243	char *allocargs[NMEMALLOC];
5244	int nalloc;
5245	struct memalloc *M;
5246
5247	if (Nmemalloc >= NMEMALLOC) {
5248		doio_fprintf(stderr, "Error - too many memory types (%d).\n",
5249			     Nmemalloc);
5250		return;
5251	}
5252
5253	M = &Memalloc[Nmemalloc];
5254
5255	nalloc = string_to_tokens(arg, allocargs, 32, ":");
5256	if (!strcmp(allocargs[0], "data")) {
5257		M->memtype = MEM_DATA;
5258		M->flags = 0;
5259		M->name = NULL;
5260		M->space = NULL;
5261		Nmemalloc++;
5262		if (nalloc >= 2) {
5263			if (strchr(allocargs[1], 'p'))
5264				M->flags |= MEMF_MPIN;
5265		}
5266	} else if (!strcmp(allocargs[0], "mmap")) {
5267		/* mmap:flags:filename[:size] */
5268		M->memtype = MEM_MMAP;
5269		M->flags = 0;
5270		M->space = NULL;
5271		if (nalloc >= 1) {
5272			if (strchr(allocargs[1], 'p'))
5273				M->flags |= MEMF_PRIVATE;
5274			if (strchr(allocargs[1], 'a'))
5275				M->flags |= MEMF_AUTORESRV;
5276			if (strchr(allocargs[1], 'l'))
5277				M->flags |= MEMF_LOCAL;
5278			if (strchr(allocargs[1], 's'))
5279				M->flags |= MEMF_SHARED;
5280
5281			if (strchr(allocargs[1], 'f'))
5282				M->flags |= MEMF_FIXADDR;
5283			if (strchr(allocargs[1], 'A'))
5284				M->flags |= MEMF_ADDR;
5285			if (strchr(allocargs[1], 'G'))
5286				M->flags |= MEMF_AUTOGROW;
5287
5288			if (strchr(allocargs[1], 'U'))
5289				M->flags |= MEMF_FILE;
5290		} else {
5291			M->flags |= MEMF_PRIVATE;
5292		}
5293
5294		if (nalloc > 2) {
5295			if (!strcmp(allocargs[2], "devzero")) {
5296				M->name = "/dev/zero";
5297				if (M->flags &
5298				    ((MEMF_PRIVATE | MEMF_LOCAL) == 0))
5299					M->flags |= MEMF_PRIVATE;
5300			} else {
5301				M->name = allocargs[2];
5302			}
5303		} else {
5304			M->name = "/dev/zero";
5305			if (M->flags & ((MEMF_PRIVATE | MEMF_LOCAL) == 0))
5306				M->flags |= MEMF_PRIVATE;
5307		}
5308		Nmemalloc++;
5309
5310	} else if (!strcmp(allocargs[0], "shmem")) {
5311		/* shmem:shmid:size */
5312		M->memtype = MEM_SHMEM;
5313		M->flags = 0;
5314		M->space = NULL;
5315		if (nalloc >= 2) {
5316			M->name = allocargs[1];
5317		} else {
5318			M->name = NULL;
5319		}
5320		if (nalloc >= 3) {
5321			sscanf(allocargs[2], "%i", &M->nblks);
5322		} else {
5323			M->nblks = 0;
5324		}
5325		if (nalloc >= 4) {
5326			if (strchr(allocargs[3], 'p'))
5327				M->flags |= MEMF_MPIN;
5328		}
5329
5330		Nmemalloc++;
5331	} else {
5332		doio_fprintf(stderr, "Error - unknown memory type '%s'.\n",
5333			     allocargs[0]);
5334		exit(1);
5335	}
5336}
5337
5338void dump_memalloc(void)
5339{
5340	int ma;
5341	char *mt;
5342
5343	if (Nmemalloc == 0) {
5344		printf("No memory allocation strategies devined\n");
5345		return;
5346	}
5347
5348	for (ma = 0; ma < Nmemalloc; ma++) {
5349		switch (Memalloc[ma].memtype) {
5350		case MEM_DATA:
5351			mt = "data";
5352			break;
5353		case MEM_SHMEM:
5354			mt = "shmem";
5355			break;
5356		case MEM_MMAP:
5357			mt = "mmap";
5358			break;
5359		default:
5360			mt = "unknown";
5361			break;
5362		}
5363		printf("mstrat[%d] = %d %s\n", ma, Memalloc[ma].memtype, mt);
5364		printf("\tflags=%#o name='%s' nblks=%d\n",
5365		       Memalloc[ma].flags,
5366		       Memalloc[ma].name, Memalloc[ma].nblks);
5367	}
5368}
5369
5370#endif /* !CRAY */
5371
5372/*
5373 * -d <op>:<time> - doio inter-operation delay
5374 *	currently this permits ONE type of delay between operations.
5375 */
5376
5377void parse_delay(char *arg)
5378{
5379	char *delayargs[NMEMALLOC];
5380	int ndelay;
5381	struct smap *s;
5382
5383	ndelay = string_to_tokens(arg, delayargs, 32, ":");
5384	if (ndelay < 2) {
5385		doio_fprintf(stderr,
5386			     "Illegal delay arg (%s). Must be operation:time\n",
5387			     arg);
5388		exit(1);
5389	}
5390	for (s = delaymap; s->string != NULL; s++)
5391		if (!strcmp(s->string, delayargs[0]))
5392			break;
5393	if (s->string == NULL) {
5394		fprintf(stderr,
5395			"Illegal Delay arg (%s).  Must be one of: ", arg);
5396
5397		for (s = delaymap; s->string != NULL; s++)
5398			fprintf(stderr, "%s ", s->string);
5399		fprintf(stderr, "\n");
5400		exit(1);
5401	}
5402
5403	delayop = s->value;
5404
5405	sscanf(delayargs[1], "%i", &delaytime);
5406
5407	if (ndelay > 2) {
5408		fprintf(stderr, "Warning: extra delay arguments ignored.\n");
5409	}
5410}
5411
5412/*
5413 * Usage clause - obvious
5414 */
5415
5416int usage(FILE * stream)
5417{
5418	/*
5419	 * Only do this if we are on vpe 0, to avoid seeing it from every
5420	 * process in the application.
5421	 */
5422
5423	if (Npes > 1 && Vpe != 0) {
5424		return 0;
5425	}
5426
5427	fprintf(stream,
5428		"usage%s:  %s [-aekv] [-m message_interval] [-n nprocs] [-r release_interval] [-w write_log] [-V validation_ftype] [-U upanic_cond] [infile]\n",
5429		TagName, Prog);
5430	return 0;
5431}
5432
5433void help(FILE * stream)
5434{
5435	/*
5436	 * Only the app running on vpe 0 gets to issue help - this prevents
5437	 * everybody in the application from doing this.
5438	 */
5439
5440	if (Npes > 1 && Vpe != 0) {
5441		return;
5442	}
5443
5444	usage(stream);
5445	fprintf(stream, "\n");
5446	fprintf(stream,
5447		"\t-a                   abort - kill all doio processes on data compare\n");
5448	fprintf(stream,
5449		"\t                     errors.  Normally only the erroring process exits\n");
5450	fprintf(stream, "\t-C data-pattern-type \n");
5451	fprintf(stream,
5452		"\t                     Available data patterns are:\n");
5453	fprintf(stream, "\t                     default - repeating pattern\n");
5454	fprintf(stream, "\t-d Operation:Time    Inter-operation delay.\n");
5455	fprintf(stream, "\t                     Operations are:\n");
5456	fprintf(stream,
5457		"\t                         select:time (1 second=1000000)\n");
5458	fprintf(stream, "\t                         sleep:time (1 second=1)\n");
5459#ifdef sgi
5460	fprintf(stream,
5461		"\t                         sginap:time (1 second=CLK_TCK=100)\n");
5462#endif
5463	fprintf(stream, "\t                         alarm:time (1 second=1)\n");
5464	fprintf(stream,
5465		"\t-e                   Re-exec children before entering the main\n");
5466	fprintf(stream,
5467		"\t                     loop.  This is useful for spreading\n");
5468	fprintf(stream,
5469		"\t                     procs around on multi-pe systems.\n");
5470	fprintf(stream,
5471		"\t-k                   Lock file regions during writes using fcntl()\n");
5472	fprintf(stream,
5473		"\t-v                   Verify writes - this is done by doing a buffered\n");
5474	fprintf(stream,
5475		"\t                     read() of the data if file io was done, or\n");
5476	fprintf(stream,
5477		"\t                     an ssread()of the data if sds io was done\n");
5478#ifndef CRAY
5479	fprintf(stream,
5480		"\t-M                   Data buffer allocation method\n");
5481	fprintf(stream, "\t                     alloc-type[,type]\n");
5482#ifdef sgi
5483	fprintf(stream, "\t			    data:flags\n");
5484	fprintf(stream, "\t			        p - mpin buffer\n");
5485	fprintf(stream, "\t			    shmem:shmid:size:flags\n");
5486	fprintf(stream, "\t			        p - mpin buffer\n");
5487#else
5488	fprintf(stream, "\t			    data\n");
5489	fprintf(stream, "\t			    shmem:shmid:size\n");
5490#endif /* sgi */
5491	fprintf(stream, "\t			    mmap:flags:filename\n");
5492	fprintf(stream, "\t			        p - private\n");
5493#ifdef sgi
5494	fprintf(stream, "\t			        s - shared\n");
5495	fprintf(stream, "\t			        l - local\n");
5496	fprintf(stream, "\t			        a - autoresrv\n");
5497	fprintf(stream, "\t			        G - autogrow\n");
5498#else
5499	fprintf(stream,
5500		"\t			        s - shared (shared file must exist\n"),
5501	    fprintf(stream,
5502		    "\t			            and have needed length)\n");
5503#endif
5504	fprintf(stream,
5505		"\t			        f - fixed address (not used)\n");
5506	fprintf(stream,
5507		"\t			        a - specify address (not used)\n");
5508	fprintf(stream,
5509		"\t			        U - Unlink file when done\n");
5510	fprintf(stream,
5511		"\t			        The default flag is private\n");
5512	fprintf(stream, "\n");
5513#endif /* !CRAY */
5514	fprintf(stream,
5515		"\t-m message_interval  Generate a message every 'message_interval'\n");
5516	fprintf(stream,
5517		"\t                     requests.  An interval of 0 suppresses\n");
5518	fprintf(stream,
5519		"\t                     messages.  The default is 0.\n");
5520	fprintf(stream, "\t-N tagname           Tag name, for Monster.\n");
5521	fprintf(stream, "\t-n nprocs            # of processes to start up\n");
5522	fprintf(stream,
5523		"\t-r release_interval  Release all memory and close\n");
5524	fprintf(stream,
5525		"\t                     files every release_interval operations.\n");
5526	fprintf(stream,
5527		"\t                     By default procs never release memory\n");
5528	fprintf(stream,
5529		"\t                     or close fds unless they have to.\n");
5530	fprintf(stream,
5531		"\t-V validation_ftype  The type of file descriptor to use for doing data\n");
5532	fprintf(stream,
5533		"\t                     validation.  validation_ftype may be an octal,\n");
5534	fprintf(stream,
5535		"\t                     hex, or decimal number representing the open()\n");
5536	fprintf(stream,
5537		"\t                     flags, or may be one of the following strings:\n");
5538	fprintf(stream,
5539		"\t                     'buffered' - validate using bufferd read\n");
5540	fprintf(stream,
5541		"\t                     'sync'     - validate using O_SYNC read\n");
5542#ifdef sgi
5543	fprintf(stream,
5544		"\t                     'direct    - validate using O_DIRECT read'\n");
5545#endif
5546#ifdef CRAY
5547	fprintf(stream,
5548		"\t                     'ldraw'    - validate using O_LDRAW read\n");
5549	fprintf(stream,
5550		"\t                     'parallel' - validate using O_PARALLEL read\n");
5551	fprintf(stream,
5552		"\t                     'raw'      - validate using O_RAW read\n");
5553#endif
5554	fprintf(stream, "\t                     By default, 'parallel'\n");
5555	fprintf(stream,
5556		"\t                     is used if the write was done with O_PARALLEL\n");
5557	fprintf(stream,
5558		"\t                     or 'buffered' for all other writes.\n");
5559	fprintf(stream,
5560		"\t-w write_log         File to log file writes to.  The doio_check\n");
5561	fprintf(stream,
5562		"\t                     program can reconstruct datafiles using the\n");
5563	fprintf(stream,
5564		"\t                     write_log, and detect if a file is corrupt\n");
5565	fprintf(stream,
5566		"\t                     after all procs have exited.\n");
5567	fprintf(stream,
5568		"\t-U upanic_cond       Comma separated list of conditions that will\n");
5569	fprintf(stream,
5570		"\t                     cause a call to upanic(PA_PANIC).\n");
5571	fprintf(stream,
5572		"\t                     'corruption' -> upanic on bad data comparisons\n");
5573	fprintf(stream,
5574		"\t                     'iosw'     ---> upanic on unexpected async iosw\n");
5575	fprintf(stream,
5576		"\t                     'rval'     ---> upanic on unexpected syscall rvals\n");
5577	fprintf(stream,
5578		"\t                     'all'      ---> all of the above\n");
5579	fprintf(stream, "\n");
5580	fprintf(stream,
5581		"\tinfile               Input stream - default is stdin - must be a list\n");
5582	fprintf(stream,
5583		"\t                     of io_req structures (see doio.h).  Currently\n");
5584	fprintf(stream,
5585		"\t                     only the iogen program generates the proper\n");
5586	fprintf(stream, "\t                     format\n");
5587}
5588