1/*
2** 2005 December 14
3**
4** The author disclaims copyright to this source code.  In place of
5** a legal notice, here is a blessing:
6**
7**    May you do good and not evil.
8**    May you find forgiveness for yourself and forgive others.
9**    May you share freely, never taking more than you give.
10**
11*************************************************************************
12**
13** $Id: sqlite3async.c,v 1.7 2009/07/18 11:52:04 danielk1977 Exp $
14**
15** This file contains the implementation of an asynchronous IO backend
16** for SQLite.
17*/
18
19#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_ASYNCIO)
20
21#include "sqlite3async.h"
22#include "sqlite3.h"
23#include <stdarg.h>
24#include <string.h>
25#include <assert.h>
26
/* Useful macros used in several places.
**
** MIN() and MAX() expand each of their arguments more than once, so the
** arguments must be free of side effects.
*/
#define MIN(x,y) ((x)<(y)?(x):(y))
#define MAX(x,y) ((x)>(y)?(x):(y))

#ifndef SQLITE_AMALGAMATION
/* Macro to mark parameters as unused and silence compiler warnings. */
#define UNUSED_PARAMETER(x) (void)(x)
#endif
35
/* Forward references to the structures defined further down this file */
typedef struct AsyncWrite AsyncWrite;         /* Entry on the write-op queue */
typedef struct AsyncFile AsyncFile;           /* sqlite3_file subclass */
typedef struct AsyncFileData AsyncFileData;   /* Data behind an AsyncFile */
typedef struct AsyncFileLock AsyncFileLock;   /* Lock state of one handle */
typedef struct AsyncLock AsyncLock;           /* Per-file list of handle locks */
42
/*
** Debug tracing. The code is compiled in only when NDEBUG is not defined,
** and it is silent unless sqlite3async_trace is set to a non-zero value.
**
** ASYNC_TRACE() takes a single, parenthesized, printf-style argument list:
**
**     ASYNC_TRACE(("LOCK %d (%s) rc=%d\n", eLock, zName, rc));
**
** The macro is wrapped in do{...}while(0) so that it behaves as exactly
** one statement, even as the body of an if/else.
*/
#ifndef NDEBUG
#include <stdio.h>
static int sqlite3async_trace = 0;
# define ASYNC_TRACE(X) do{ if( sqlite3async_trace ) asyncTrace X; }while(0)

/*
** Format the message using sqlite3_vmprintf() and write it to stderr.
** The formatting rules are therefore those of the SQLite printf
** implementation. If the message cannot be allocated nothing is printed.
*/
static void asyncTrace(const char *zFormat, ...){
  char *z;
  va_list ap;
  va_start(ap, zFormat);
  z = sqlite3_vmprintf(zFormat, ap);
  va_end(ap);
  /* sqlite3_vmprintf() returns NULL if memory cannot be allocated. Passing
  ** NULL to a "%s" conversion of fprintf() is undefined behavior. */
  if( z ){
    fprintf(stderr, "[%d] %s", 0 /* (int)pthread_self() */, z);
    sqlite3_free(z);
  }
}
#else
# define ASYNC_TRACE(X)
#endif
60
61/*
62** THREAD SAFETY NOTES
63**
64** Basic rules:
65**
66**     * Both read and write access to the global write-op queue must be
67**       protected by the async.queueMutex. As are the async.ioError and
68**       async.nFile variables.
69**
70**     * The async.pLock list and all AsyncLock and AsyncFileLock
71**       structures must be protected by the async.lockMutex mutex.
72**
73**     * The file handles from the underlying system are not assumed to
74**       be thread safe.
75**
76**     * See the last two paragraphs under "The Writer Thread" for
77**       an assumption to do with file-handle synchronization by the Os.
78**
79** Deadlock prevention:
80**
**     There are three mutexes used by the system: the "writer" mutex,
**     the "queue" mutex and the "lock" mutex. Rules are:
**
**     * It is illegal to block on the writer mutex when any other mutex
**       is held, and
**
**     * It is illegal to block on the queue mutex when the lock mutex
**       is held.
**
**     i.e. mutexes must be grabbed in the order "writer", "queue", "lock".
91**
92** File system operations (invoked by SQLite thread):
93**
94**     xOpen
95**     xDelete
96**     xFileExists
97**
98** File handle operations (invoked by SQLite thread):
99**
100**         asyncWrite, asyncClose, asyncTruncate, asyncSync
101**
102**     The operations above add an entry to the global write-op list. They
103**     prepare the entry, acquire the async.queueMutex momentarily while
104**     list pointers are  manipulated to insert the new entry, then release
105**     the mutex and signal the writer thread to wake up in case it happens
106**     to be asleep.
107**
108**
109**         asyncRead, asyncFileSize.
110**
**     Read operations. Both of these read from the underlying file
**     first, then adjust their result based on pending writes in the
**     write-op queue. So async.queueMutex is held for the duration
**     of these operations to prevent other threads from changing the
**     queue in mid operation.
116**
117**
118**         asyncLock, asyncUnlock, asyncCheckReservedLock
119**
**     These primitives implement in-process locking using a linked list
**     of AsyncLock structures (async.pLock), one per file name. Files are
**     locked correctly for connections coming from the same process. When
**     no file-system lock is taken (see getFileLock()), other processes
**     cannot see these locks and will therefore not honor them.
124**
125**
126** The writer thread:
127**
**     The async.writerMutex is used to make sure there is only
**     a single writer thread running at a time.
130**
131**     Inside the writer thread is a loop that works like this:
132**
133**         WHILE (write-op list is not empty)
134**             Do IO operation at head of write-op list
135**             Remove entry from head of write-op list
136**         END WHILE
137**
138**     The async.queueMutex is always held during the <write-op list is
139**     not empty> test, and when the entry is removed from the head
140**     of the write-op list. Sometimes it is held for the interim
141**     period (while the IO is performed), and sometimes it is
142**     relinquished. It is relinquished if (a) the IO op is an
143**     ASYNC_CLOSE or (b) when the file handle was opened, two of
144**     the underlying systems handles were opened on the same
145**     file-system entry.
146**
147**     If condition (b) above is true, then one file-handle
148**     (AsyncFile.pBaseRead) is used exclusively by sqlite threads to read the
149**     file, the other (AsyncFile.pBaseWrite) by sqlite3_async_flush()
150**     threads to perform write() operations. This means that read
151**     operations are not blocked by asynchronous writes (although
152**     asynchronous writes may still be blocked by reads).
153**
154**     This assumes that the OS keeps two handles open on the same file
155**     properly in sync. That is, any read operation that starts after a
156**     write operation on the same file system entry has completed returns
157**     data consistent with the write. We also assume that if one thread
158**     reads a file while another is writing it all bytes other than the
159**     ones actually being written contain valid data.
160**
161**     If the above assumptions are not true, set the preprocessor symbol
162**     SQLITE_ASYNC_TWO_FILEHANDLES to 0.
163*/
164
165
/* TESTONLY(X) expands to X when assert() is enabled and to nothing when
** NDEBUG is defined. It is used to wrap statements that exist only to
** support assert() checks, such as recording which thread holds a mutex.
*/
#ifndef NDEBUG
# define TESTONLY( X ) X
#else
# define TESTONLY( X )
#endif
171
172/*
173** PORTING FUNCTIONS
174**
175** There are two definitions of the following functions. One for pthreads
176** compatible systems and one for Win32. These functions isolate the OS
177** specific code required by each platform.
178**
179** The system uses three mutexes and a single condition variable. To
180** block on a mutex, async_mutex_enter() is called. The parameter passed
181** to async_mutex_enter(), which must be one of ASYNC_MUTEX_LOCK,
182** ASYNC_MUTEX_QUEUE or ASYNC_MUTEX_WRITER, identifies which of the three
183** mutexes to lock. Similarly, to unlock a mutex, async_mutex_leave() is
184** called with a parameter identifying the mutex being unlocked. Mutexes
185** are not recursive - it is an error to call async_mutex_enter() to
186** lock a mutex that is already locked, or to call async_mutex_leave()
187** to unlock a mutex that is not currently locked.
188**
189** The async_cond_wait() and async_cond_signal() functions are modelled
190** on the pthreads functions with similar names. The first parameter to
191** both functions is always ASYNC_COND_QUEUE. When async_cond_wait()
192** is called the mutex identified by the second parameter must be held.
193** The mutex is unlocked, and the calling thread simultaneously begins
194** waiting for the condition variable to be signalled by another thread.
195** After another thread signals the condition variable, the calling
196** thread stops waiting, locks mutex eMutex and returns. The
197** async_cond_signal() function is used to signal the condition variable.
198** It is assumed that the mutex used by the thread calling async_cond_wait()
199** is held by the caller of async_cond_signal() (otherwise there would be
200** a race condition).
201**
202** It is guaranteed that no other thread will call async_cond_wait() when
203** there is already a thread waiting on the condition variable.
204**
205** The async_sched_yield() function is called to suggest to the operating
206** system that it would be a good time to shift the current thread off the
207** CPU. The system will still work if this function is not implemented
208** (it is not currently implemented for win32), but it might be marginally
209** more efficient if it is.
210*/
211static void async_mutex_enter(int eMutex);
212static void async_mutex_leave(int eMutex);
213static void async_cond_wait(int eCond, int eMutex);
214static void async_cond_signal(int eCond);
215static void async_sched_yield(void);
216
217/*
** There are also two definitions of the following. async_os_initialize()
** is called when the asynchronous VFS is first installed, and
** async_os_shutdown() is called when it is uninstalled (from within
** sqlite3async_shutdown()).
221**
222** For pthreads builds, both of these functions are no-ops. For win32,
223** they provide an opportunity to initialize and finalize the required
224** mutex and condition variables.
225**
226** If async_os_initialize() returns other than zero, then the initialization
227** fails and SQLITE_ERROR is returned to the user.
228*/
229static int async_os_initialize(void);
230static void async_os_shutdown(void);
231
/* Values for use as the 'eMutex' argument of the above functions. The
** integer values assigned to these constants are important for assert()
** statements that verify that mutexes are locked in the correct order.
** Specifically, it is unsafe to try to lock mutex N while holding a lock
** on mutex M if (M<=N).
*/
#define ASYNC_MUTEX_LOCK    0   /* Guards async.pLock and the lock structures */
#define ASYNC_MUTEX_QUEUE   1   /* Guards the write-op queue, ioError, nFile */
#define ASYNC_MUTEX_WRITER  2   /* Allows only one writer thread at a time */

/* Values for use as the 'eCond' argument of the above functions. */
#define ASYNC_COND_QUEUE    0   /* The only condition variable in use */
244
245/*************************************************************************
246** Start of OS specific code.
247*/
248#if SQLITE_OS_WIN || defined(_WIN32) || defined(WIN32) || defined(__CYGWIN__) || defined(__MINGW32__) || defined(__BORLANDC__)
249
250#include <windows.h>
251
252/* The following block contains the win32 specific code. */
253
/* True if the calling thread is recorded as the holder of mutex X. The
** aHolder[] entries are only maintained inside TESTONLY(), so this macro
** is only meaningful inside assert() expressions. */
#define mutex_held(X) (GetCurrentThreadId()==primitives.aHolder[X])

/* The Win32 synchronization objects used by the asynchronous IO backend. */
static struct AsyncPrimitives {
  int isInit;                   /* Set by async_os_initialize(), cleared by
                                ** async_os_shutdown() */
  DWORD aHolder[3];             /* Id of thread holding each mutex (TESTONLY) */
  CRITICAL_SECTION aMutex[3];   /* Indexed by the ASYNC_MUTEX_XXX values */
  HANDLE aCond[1];              /* Event object, indexed by ASYNC_COND_XXX */
} primitives = { 0 };
262
263static int async_os_initialize(void){
264  if( !primitives.isInit ){
265    primitives.aCond[0] = CreateEvent(NULL, TRUE, FALSE, 0);
266    if( primitives.aCond[0]==NULL ){
267      return 1;
268    }
269    InitializeCriticalSection(&primitives.aMutex[0]);
270    InitializeCriticalSection(&primitives.aMutex[1]);
271    InitializeCriticalSection(&primitives.aMutex[2]);
272    primitives.isInit = 1;
273  }
274  return 0;
275}
276static void async_os_shutdown(void){
277  if( primitives.isInit ){
278    DeleteCriticalSection(&primitives.aMutex[0]);
279    DeleteCriticalSection(&primitives.aMutex[1]);
280    DeleteCriticalSection(&primitives.aMutex[2]);
281    CloseHandle(primitives.aCond[0]);
282    primitives.isInit = 0;
283  }
284}
285
286/* The following block contains the Win32 specific code. */
287static void async_mutex_enter(int eMutex){
288  assert( eMutex==0 || eMutex==1 || eMutex==2 );
289  assert( eMutex!=2 || (!mutex_held(0) && !mutex_held(1) && !mutex_held(2)) );
290  assert( eMutex!=1 || (!mutex_held(0) && !mutex_held(1)) );
291  assert( eMutex!=0 || (!mutex_held(0)) );
292  EnterCriticalSection(&primitives.aMutex[eMutex]);
293  TESTONLY( primitives.aHolder[eMutex] = GetCurrentThreadId(); )
294}
295static void async_mutex_leave(int eMutex){
296  assert( eMutex==0 || eMutex==1 || eMutex==2 );
297  assert( mutex_held(eMutex) );
298  TESTONLY( primitives.aHolder[eMutex] = 0; )
299  LeaveCriticalSection(&primitives.aMutex[eMutex]);
300}
301static void async_cond_wait(int eCond, int eMutex){
302  ResetEvent(primitives.aCond[eCond]);
303  async_mutex_leave(eMutex);
304  WaitForSingleObject(primitives.aCond[eCond], INFINITE);
305  async_mutex_enter(eMutex);
306}
307static void async_cond_signal(int eCond){
308  assert( mutex_held(ASYNC_MUTEX_QUEUE) );
309  SetEvent(primitives.aCond[eCond]);
310}
311static void async_sched_yield(void){
312  Sleep(0);
313}
314#else
315
316/* The following block contains the pthreads specific code. */
317#include <pthread.h>
318#include <sched.h>
319
/* True if the calling thread is recorded as the holder of mutex X. The
** aHolder[] entries are only maintained inside TESTONLY(), so this macro
** is only meaningful inside assert() expressions. */
#define mutex_held(X) pthread_equal(primitives.aHolder[X], pthread_self())

/*
** For pthreads builds there is nothing to set up or tear down, so both of
** these are no-ops. async_os_initialize() always reports success (0).
*/
static int async_os_initialize(void){
  return 0;
}
static void async_os_shutdown(void){
  return;
}
324
/* The pthreads synchronization objects used by the asynchronous IO
** backend. The mutexes and the condition variable are initialized
** statically, using the PTHREAD_xxx_INITIALIZER macros. */
static struct AsyncPrimitives {
  pthread_mutex_t aMutex[3];    /* Indexed by the ASYNC_MUTEX_XXX values */
  pthread_cond_t aCond[1];      /* Indexed by the ASYNC_COND_XXX values */
  pthread_t aHolder[3];         /* Thread holding each mutex (TESTONLY) */
} primitives = {
  { PTHREAD_MUTEX_INITIALIZER,
    PTHREAD_MUTEX_INITIALIZER,
    PTHREAD_MUTEX_INITIALIZER
  } , {
    PTHREAD_COND_INITIALIZER
  } , { 0, 0, 0 }
};
337
338static void async_mutex_enter(int eMutex){
339  assert( eMutex==0 || eMutex==1 || eMutex==2 );
340  assert( eMutex!=2 || (!mutex_held(0) && !mutex_held(1) && !mutex_held(2)) );
341  assert( eMutex!=1 || (!mutex_held(0) && !mutex_held(1)) );
342  assert( eMutex!=0 || (!mutex_held(0)) );
343  pthread_mutex_lock(&primitives.aMutex[eMutex]);
344  TESTONLY( primitives.aHolder[eMutex] = pthread_self(); )
345}
346static void async_mutex_leave(int eMutex){
347  assert( eMutex==0 || eMutex==1 || eMutex==2 );
348  assert( mutex_held(eMutex) );
349  TESTONLY( primitives.aHolder[eMutex] = 0; )
350  pthread_mutex_unlock(&primitives.aMutex[eMutex]);
351}
352static void async_cond_wait(int eCond, int eMutex){
353  assert( eMutex==0 || eMutex==1 || eMutex==2 );
354  assert( mutex_held(eMutex) );
355  TESTONLY( primitives.aHolder[eMutex] = 0; )
356  pthread_cond_wait(&primitives.aCond[eCond], &primitives.aMutex[eMutex]);
357  TESTONLY( primitives.aHolder[eMutex] = pthread_self(); )
358}
359static void async_cond_signal(int eCond){
360  assert( mutex_held(ASYNC_MUTEX_QUEUE) );
361  pthread_cond_signal(&primitives.aCond[eCond]);
362}
/*
** pthreads: suggest to the operating system that this would be a good
** time to run some other thread. The result of sched_yield() is ignored.
*/
static void async_sched_yield(void){
  (void)sched_yield();
}
366#endif
367/*
368** End of OS specific code.
369*************************************************************************/
370
/* Assert that the calling thread holds mutex X (an ASYNC_MUTEX_XXX value) */
#define assert_mutex_is_held(X) assert( mutex_held(X) )


/* Defaults to 1 unless it is already defined. See the notes under "The
** writer thread" near the top of this file for when to set it to 0. */
#ifndef SQLITE_ASYNC_TWO_FILEHANDLES
/* #define SQLITE_ASYNC_TWO_FILEHANDLES 0 */
#define SQLITE_ASYNC_TWO_FILEHANDLES 1
#endif
378
/*
** State information is held in the static variable "async" defined
** as the following structure.
**
** Both async.ioError and async.nFile are protected by async.queueMutex.
**
** The initializer is positional: every field starts out as zero except
** bLockFiles, which starts out as 1.
*/
static struct TestAsyncStaticData {
  AsyncWrite *pQueueFirst;     /* Next write operation to be processed */
  AsyncWrite *pQueueLast;      /* Last write operation on the list */
  AsyncLock *pLock;            /* Linked list of all AsyncLock structures */
  volatile int ioDelay;        /* Extra delay between write operations */
  volatile int eHalt;          /* One of the SQLITEASYNC_HALT_XXX values */
  volatile int bLockFiles;     /* Current value of "lockfiles" parameter */
  int ioError;                 /* True if an IO error has occurred */
  int nFile;                   /* Number of open files (from sqlite pov) */
} async = { 0,0,0,0,0,1,0,0 };
395
/* Possible values of AsyncWrite.op */
#define ASYNC_NOOP          0
#define ASYNC_WRITE         1
#define ASYNC_SYNC          2
#define ASYNC_TRUNCATE      3
#define ASYNC_CLOSE         4
#define ASYNC_DELETE        5
#define ASYNC_OPENEXCLUSIVE 6
#define ASYNC_UNLOCK        7

/* Names of opcodes, indexed by the ASYNC_xxx values above.  Used for
** debugging only. Make sure these stay in sync with the macros above!
** Neither the table nor the strings are ever modified, so both are const.
*/
static const char *const azOpcodeName[] = {
  "NOOP", "WRITE", "SYNC", "TRUNCATE", "CLOSE", "DELETE", "OPENEX", "UNLOCK"
};

/* Compile-time check that azOpcodeName[] has exactly one entry for each
** opcode from ASYNC_NOOP to ASYNC_UNLOCK. If the two lists drift apart
** the array type below has a negative size and compilation fails.
*/
typedef char AsyncOpcodeNameCheck[
  (sizeof(azOpcodeName)/sizeof(azOpcodeName[0]))==(ASYNC_UNLOCK+1) ? 1 : -1
];
412
413/*
414** Entries on the write-op queue are instances of the AsyncWrite
415** structure, defined here.
416**
417** The interpretation of the iOffset and nByte variables varies depending
418** on the value of AsyncWrite.op:
419**
420** ASYNC_NOOP:
421**     No values used.
422**
423** ASYNC_WRITE:
424**     iOffset -> Offset in file to write to.
425**     nByte   -> Number of bytes of data to write (pointed to by zBuf).
426**
427** ASYNC_SYNC:
428**     nByte   -> flags to pass to sqlite3OsSync().
429**
430** ASYNC_TRUNCATE:
431**     iOffset -> Size to truncate file to.
432**     nByte   -> Unused.
433**
434** ASYNC_CLOSE:
435**     iOffset -> Unused.
436**     nByte   -> Unused.
437**
438** ASYNC_DELETE:
439**     iOffset -> Contains the "syncDir" flag.
440**     nByte   -> Number of bytes of zBuf points to (file name).
441**
442** ASYNC_OPENEXCLUSIVE:
443**     iOffset -> Value of "delflag".
444**     nByte   -> Number of bytes of zBuf points to (file name).
445**
446** ASYNC_UNLOCK:
447**     nByte   -> Argument to sqlite3OsUnlock().
448**
449**
450** For an ASYNC_WRITE operation, zBuf points to the data to write to the file.
451** This space is sqlite3_malloc()d along with the AsyncWrite structure in a
452** single blob, so is deleted when sqlite3_free() is called on the parent
453** structure.
454*/
struct AsyncWrite {
  AsyncFileData *pFileData;    /* File to write data to or sync */
  int op;                      /* One of ASYNC_xxx etc. */
  sqlite_int64 iOffset;        /* See above */
  int nByte;          /* See above */
  char *zBuf;         /* Payload stored directly after this structure: data
                      ** for ASYNC_WRITE, file name for ASYNC_DELETE and
                      ** ASYNC_OPENEXCLUSIVE. NULL if there is no payload */
  AsyncWrite *pNext;  /* Next write operation (to any file) */
};
463
/*
** An instance of this structure is created for each distinct open file
** (i.e. if two handles are opened on the one file, only one of these
** structures is allocated) and stored in the linked list headed by
** async.pLock. Entries are identified by the full pathname of the opened
** file (see findLock()).
469**
470** AsyncLock.pList points to the head of a linked list of AsyncFileLock
471** structures, one for each handle currently open on the file.
472**
473** If the opened file is not a main-database (the SQLITE_OPEN_MAIN_DB is
474** not passed to the sqlite3OsOpen() call), or if async.bLockFiles is
475** false, variables AsyncLock.pFile and AsyncLock.eLock are never used.
476** Otherwise, pFile is a file handle opened on the file in question and
477** used to obtain the file-system locks required by database connections
478** within this process.
479**
480** See comments above the asyncLock() function for more details on
481** the implementation of database locking used by this backend.
482*/
struct AsyncLock {
  char *zFile;                /* File name; the key compared by findLock() */
  int nFile;                  /* Number of bytes in zFile */
  sqlite3_file *pFile;        /* Handle used to take file-system locks, or
                              ** NULL (getFileLock() is then a no-op) */
  int eLock;                  /* Lock level currently held on pFile */
  AsyncFileLock *pList;       /* List of AsyncFileLock, one per open handle */
  AsyncLock *pNext;           /* Next in linked list headed by async.pLock */
};
491
/*
** An instance of the following structure is allocated along with each
** AsyncFileData structure (see AsyncFileData.lock), but is only used if the
** file was opened with the SQLITE_OPEN_MAIN_DB.
**
** asyncLock() raises both eLock and eAsyncLock. asyncUnlock() lowers only
** eLock and queues an ASYNC_UNLOCK operation, so eAsyncLock may stay above
** eLock for a while. eAsyncLock is the value getFileLock() consults when
** deciding which file-system lock is required.
** NOTE(review): eAsyncLock is presumably lowered when the writer thread
** processes the queued ASYNC_UNLOCK - that code is not in this part of the
** file, confirm there.
*/
struct AsyncFileLock {
  int eLock;                /* Internally visible lock state (sqlite pov) */
  int eAsyncLock;           /* Lock-state with write-queue unlock */
  AsyncFileLock *pNext;     /* Next handle in the AsyncLock.pList list */
};
502
503/*
504** The AsyncFile structure is a subclass of sqlite3_file used for
505** asynchronous IO.
506**
507** All of the actual data for the structure is stored in the structure
508** pointed to by AsyncFile.pData, which is allocated as part of the
509** sqlite3OsOpen() using sqlite3_malloc(). The reason for this is that the
510** lifetime of the AsyncFile structure is ended by the caller after OsClose()
511** is called, but the data in AsyncFileData may be required by the
512** writer thread after that point.
513*/
struct AsyncFile {
  sqlite3_io_methods *pMethod;   /* First member, as in the sqlite3_file
                                 ** base class this structure is cast from */
  AsyncFileData *pData;          /* All of the real state for this handle */
};
/* The data behind an AsyncFile handle. The ASYNC_CLOSE entry queued by
** asyncClose() is the closeOp member of this structure, so the structure
** is still referenced from the write-op queue after the handle is closed.
*/
struct AsyncFileData {
  char *zName;               /* Underlying OS filename - used for debugging */
  int nName;                 /* Number of characters in zName */
  sqlite3_file *pBaseRead;   /* Read handle to the underlying Os file */
  sqlite3_file *pBaseWrite;  /* Write handle to the underlying Os file */
  AsyncFileLock lock;        /* Lock state for this handle */
  AsyncLock *pLock;          /* AsyncLock object for this file system entry */
  AsyncWrite closeOp;        /* Preallocated close operation */
};
527
528/*
529** Add an entry to the end of the global write-op list. pWrite should point
530** to an AsyncWrite structure allocated using sqlite3_malloc().  The writer
531** thread will call sqlite3_free() to free the structure after the specified
532** operation has been completed.
533**
534** Once an AsyncWrite structure has been added to the list, it becomes the
535** property of the writer thread and must not be read or modified by the
536** caller.
537*/
538static void addAsyncWrite(AsyncWrite *pWrite){
539  /* We must hold the queue mutex in order to modify the queue pointers */
540  if( pWrite->op!=ASYNC_UNLOCK ){
541    async_mutex_enter(ASYNC_MUTEX_QUEUE);
542  }
543
544  /* Add the record to the end of the write-op queue */
545  assert( !pWrite->pNext );
546  if( async.pQueueLast ){
547    assert( async.pQueueFirst );
548    async.pQueueLast->pNext = pWrite;
549  }else{
550    async.pQueueFirst = pWrite;
551  }
552  async.pQueueLast = pWrite;
553  ASYNC_TRACE(("PUSH %p (%s %s %d)\n", pWrite, azOpcodeName[pWrite->op],
554         pWrite->pFileData ? pWrite->pFileData->zName : "-", pWrite->iOffset));
555
556  if( pWrite->op==ASYNC_CLOSE ){
557    async.nFile--;
558  }
559
560  /* The writer thread might have been idle because there was nothing
561  ** on the write-op queue for it to do.  So wake it up. */
562  async_cond_signal(ASYNC_COND_QUEUE);
563
564  /* Drop the queue mutex */
565  if( pWrite->op!=ASYNC_UNLOCK ){
566    async_mutex_leave(ASYNC_MUTEX_QUEUE);
567  }
568}
569
570/*
571** Increment async.nFile in a thread-safe manner.
572*/
573static void incrOpenFileCount(void){
574  /* We must hold the queue mutex in order to modify async.nFile */
575  async_mutex_enter(ASYNC_MUTEX_QUEUE);
576  if( async.nFile==0 ){
577    async.ioError = SQLITE_OK;
578  }
579  async.nFile++;
580  async_mutex_leave(ASYNC_MUTEX_QUEUE);
581}
582
583/*
584** This is a utility function to allocate and populate a new AsyncWrite
585** structure and insert it (via addAsyncWrite() ) into the global list.
586*/
587static int addNewAsyncWrite(
588  AsyncFileData *pFileData,
589  int op,
590  sqlite3_int64 iOffset,
591  int nByte,
592  const char *zByte
593){
594  AsyncWrite *p;
595  if( op!=ASYNC_CLOSE && async.ioError ){
596    return async.ioError;
597  }
598  p = sqlite3_malloc(sizeof(AsyncWrite) + (zByte?nByte:0));
599  if( !p ){
600    /* The upper layer does not expect operations like OsWrite() to
601    ** return SQLITE_NOMEM. This is partly because under normal conditions
602    ** SQLite is required to do rollback without calling malloc(). So
603    ** if malloc() fails here, treat it as an I/O error. The above
604    ** layer knows how to handle that.
605    */
606    return SQLITE_IOERR;
607  }
608  p->op = op;
609  p->iOffset = iOffset;
610  p->nByte = nByte;
611  p->pFileData = pFileData;
612  p->pNext = 0;
613  if( zByte ){
614    p->zBuf = (char *)&p[1];
615    memcpy(p->zBuf, zByte, nByte);
616  }else{
617    p->zBuf = 0;
618  }
619  addAsyncWrite(p);
620  return SQLITE_OK;
621}
622
623/*
624** Close the file. This just adds an entry to the write-op list, the file is
625** not actually closed.
626*/
627static int asyncClose(sqlite3_file *pFile){
628  AsyncFileData *p = ((AsyncFile *)pFile)->pData;
629
630  /* Unlock the file, if it is locked */
631  async_mutex_enter(ASYNC_MUTEX_LOCK);
632  p->lock.eLock = 0;
633  async_mutex_leave(ASYNC_MUTEX_LOCK);
634
635  addAsyncWrite(&p->closeOp);
636  return SQLITE_OK;
637}
638
639/*
640** Implementation of sqlite3OsWrite() for asynchronous files. Instead of
641** writing to the underlying file, this function adds an entry to the end of
642** the global AsyncWrite list. Either SQLITE_OK or SQLITE_NOMEM may be
643** returned.
644*/
645static int asyncWrite(
646  sqlite3_file *pFile,
647  const void *pBuf,
648  int amt,
649  sqlite3_int64 iOff
650){
651  AsyncFileData *p = ((AsyncFile *)pFile)->pData;
652  return addNewAsyncWrite(p, ASYNC_WRITE, iOff, amt, pBuf);
653}
654
655/*
656** Read data from the file. First we read from the filesystem, then adjust
657** the contents of the buffer based on ASYNC_WRITE operations in the
658** write-op queue.
659**
660** This method holds the mutex from start to finish.
661*/
662static int asyncRead(
663  sqlite3_file *pFile,
664  void *zOut,
665  int iAmt,
666  sqlite3_int64 iOffset
667){
668  AsyncFileData *p = ((AsyncFile *)pFile)->pData;
669  int rc = SQLITE_OK;
670  sqlite3_int64 filesize = 0;
671  sqlite3_file *pBase = p->pBaseRead;
672  sqlite3_int64 iAmt64 = (sqlite3_int64)iAmt;
673
674  /* Grab the write queue mutex for the duration of the call */
675  async_mutex_enter(ASYNC_MUTEX_QUEUE);
676
677  /* If an I/O error has previously occurred in this virtual file
678  ** system, then all subsequent operations fail.
679  */
680  if( async.ioError!=SQLITE_OK ){
681    rc = async.ioError;
682    goto asyncread_out;
683  }
684
685  if( pBase->pMethods ){
686    sqlite3_int64 nRead;
687    rc = pBase->pMethods->xFileSize(pBase, &filesize);
688    if( rc!=SQLITE_OK ){
689      goto asyncread_out;
690    }
691    nRead = MIN(filesize - iOffset, iAmt64);
692    if( nRead>0 ){
693      rc = pBase->pMethods->xRead(pBase, zOut, (int)nRead, iOffset);
694      ASYNC_TRACE(("READ %s %d bytes at %d\n", p->zName, nRead, iOffset));
695    }
696  }
697
698  if( rc==SQLITE_OK ){
699    AsyncWrite *pWrite;
700    char *zName = p->zName;
701
702    for(pWrite=async.pQueueFirst; pWrite; pWrite = pWrite->pNext){
703      if( pWrite->op==ASYNC_WRITE && (
704        (pWrite->pFileData==p) ||
705        (zName && pWrite->pFileData->zName==zName)
706      )){
707        sqlite3_int64 nCopy;
708        sqlite3_int64 nByte64 = (sqlite3_int64)pWrite->nByte;
709
710        /* Set variable iBeginIn to the offset in buffer pWrite->zBuf[] from
711        ** which data should be copied. Set iBeginOut to the offset within
712        ** the output buffer to which data should be copied. If either of
713        ** these offsets is a negative number, set them to 0.
714        */
715        sqlite3_int64 iBeginOut = (pWrite->iOffset-iOffset);
716        sqlite3_int64 iBeginIn = -iBeginOut;
717        if( iBeginIn<0 ) iBeginIn = 0;
718        if( iBeginOut<0 ) iBeginOut = 0;
719
720        filesize = MAX(filesize, pWrite->iOffset+nByte64);
721
722        nCopy = MIN(nByte64-iBeginIn, iAmt64-iBeginOut);
723        if( nCopy>0 ){
724          memcpy(&((char *)zOut)[iBeginOut], &pWrite->zBuf[iBeginIn], (size_t)nCopy);
725          ASYNC_TRACE(("OVERREAD %d bytes at %d\n", nCopy, iBeginOut+iOffset));
726        }
727      }
728    }
729  }
730
731asyncread_out:
732  async_mutex_leave(ASYNC_MUTEX_QUEUE);
733  if( rc==SQLITE_OK && filesize<(iOffset+iAmt) ){
734    rc = SQLITE_IOERR_SHORT_READ;
735  }
736  return rc;
737}
738
739/*
740** Truncate the file to nByte bytes in length. This just adds an entry to
741** the write-op list, no IO actually takes place.
742*/
743static int asyncTruncate(sqlite3_file *pFile, sqlite3_int64 nByte){
744  AsyncFileData *p = ((AsyncFile *)pFile)->pData;
745  return addNewAsyncWrite(p, ASYNC_TRUNCATE, nByte, 0, 0);
746}
747
748/*
749** Sync the file. This just adds an entry to the write-op list, the
750** sync() is done later by sqlite3_async_flush().
751*/
752static int asyncSync(sqlite3_file *pFile, int flags){
753  AsyncFileData *p = ((AsyncFile *)pFile)->pData;
754  return addNewAsyncWrite(p, ASYNC_SYNC, 0, flags, 0);
755}
756
757/*
758** Read the size of the file. First we read the size of the file system
759** entry, then adjust for any ASYNC_WRITE or ASYNC_TRUNCATE operations
760** currently in the write-op list.
761**
762** This method holds the mutex from start to finish.
763*/
764int asyncFileSize(sqlite3_file *pFile, sqlite3_int64 *piSize){
765  AsyncFileData *p = ((AsyncFile *)pFile)->pData;
766  int rc = SQLITE_OK;
767  sqlite3_int64 s = 0;
768  sqlite3_file *pBase;
769
770  async_mutex_enter(ASYNC_MUTEX_QUEUE);
771
772  /* Read the filesystem size from the base file. If pMethods is NULL, this
773  ** means the file hasn't been opened yet. In this case all relevant data
774  ** must be in the write-op queue anyway, so we can omit reading from the
775  ** file-system.
776  */
777  pBase = p->pBaseRead;
778  if( pBase->pMethods ){
779    rc = pBase->pMethods->xFileSize(pBase, &s);
780  }
781
782  if( rc==SQLITE_OK ){
783    AsyncWrite *pWrite;
784    for(pWrite=async.pQueueFirst; pWrite; pWrite = pWrite->pNext){
785      if( pWrite->op==ASYNC_DELETE
786       && p->zName
787       && strcmp(p->zName, pWrite->zBuf)==0
788      ){
789        s = 0;
790      }else if( pWrite->pFileData && (
791          (pWrite->pFileData==p)
792       || (p->zName && pWrite->pFileData->zName==p->zName)
793      )){
794        switch( pWrite->op ){
795          case ASYNC_WRITE:
796            s = MAX(pWrite->iOffset + (sqlite3_int64)(pWrite->nByte), s);
797            break;
798          case ASYNC_TRUNCATE:
799            s = MIN(s, pWrite->iOffset);
800            break;
801        }
802      }
803    }
804    *piSize = s;
805  }
806  async_mutex_leave(ASYNC_MUTEX_QUEUE);
807  return rc;
808}
809
810/*
811** Lock or unlock the actual file-system entry.
812*/
813static int getFileLock(AsyncLock *pLock){
814  int rc = SQLITE_OK;
815  AsyncFileLock *pIter;
816  int eRequired = 0;
817
818  if( pLock->pFile ){
819    for(pIter=pLock->pList; pIter; pIter=pIter->pNext){
820      assert(pIter->eAsyncLock>=pIter->eLock);
821      if( pIter->eAsyncLock>eRequired ){
822        eRequired = pIter->eAsyncLock;
823        assert(eRequired>=0 && eRequired<=SQLITE_LOCK_EXCLUSIVE);
824      }
825    }
826
827    if( eRequired>pLock->eLock ){
828      rc = pLock->pFile->pMethods->xLock(pLock->pFile, eRequired);
829      if( rc==SQLITE_OK ){
830        pLock->eLock = eRequired;
831      }
832    }
833    else if( eRequired<pLock->eLock && eRequired<=SQLITE_LOCK_SHARED ){
834      rc = pLock->pFile->pMethods->xUnlock(pLock->pFile, eRequired);
835      if( rc==SQLITE_OK ){
836        pLock->eLock = eRequired;
837      }
838    }
839  }
840
841  return rc;
842}
843
844/*
845** Return the AsyncLock structure from the global async.pLock list
846** associated with the file-system entry identified by path zName
847** (a string of nName bytes). If no such structure exists, return 0.
848*/
849static AsyncLock *findLock(const char *zName, int nName){
850  AsyncLock *p = async.pLock;
851  while( p && (p->nFile!=nName || memcmp(p->zFile, zName, nName)) ){
852    p = p->pNext;
853  }
854  return p;
855}
856
857/*
858** The following two methods - asyncLock() and asyncUnlock() - are used
859** to obtain and release locks on database files opened with the
860** asynchronous backend.
861*/
862static int asyncLock(sqlite3_file *pFile, int eLock){
863  int rc = SQLITE_OK;
864  AsyncFileData *p = ((AsyncFile *)pFile)->pData;
865
866  if( p->zName ){
867    async_mutex_enter(ASYNC_MUTEX_LOCK);
868    if( p->lock.eLock<eLock ){
869      AsyncLock *pLock = p->pLock;
870      AsyncFileLock *pIter;
871      assert(pLock && pLock->pList);
872      for(pIter=pLock->pList; pIter; pIter=pIter->pNext){
873        if( pIter!=&p->lock && (
874          (eLock==SQLITE_LOCK_EXCLUSIVE && pIter->eLock>=SQLITE_LOCK_SHARED) ||
875          (eLock==SQLITE_LOCK_PENDING && pIter->eLock>=SQLITE_LOCK_RESERVED) ||
876          (eLock==SQLITE_LOCK_RESERVED && pIter->eLock>=SQLITE_LOCK_RESERVED) ||
877          (eLock==SQLITE_LOCK_SHARED && pIter->eLock>=SQLITE_LOCK_PENDING)
878        )){
879          rc = SQLITE_BUSY;
880        }
881      }
882      if( rc==SQLITE_OK ){
883        p->lock.eLock = eLock;
884        p->lock.eAsyncLock = MAX(p->lock.eAsyncLock, eLock);
885      }
886      assert(p->lock.eAsyncLock>=p->lock.eLock);
887      if( rc==SQLITE_OK ){
888        rc = getFileLock(pLock);
889      }
890    }
891    async_mutex_leave(ASYNC_MUTEX_LOCK);
892  }
893
894  ASYNC_TRACE(("LOCK %d (%s) rc=%d\n", eLock, p->zName, rc));
895  return rc;
896}
897static int asyncUnlock(sqlite3_file *pFile, int eLock){
898  int rc = SQLITE_OK;
899  AsyncFileData *p = ((AsyncFile *)pFile)->pData;
900  if( p->zName ){
901    AsyncFileLock *pLock = &p->lock;
902    async_mutex_enter(ASYNC_MUTEX_QUEUE);
903    async_mutex_enter(ASYNC_MUTEX_LOCK);
904    pLock->eLock = MIN(pLock->eLock, eLock);
905    rc = addNewAsyncWrite(p, ASYNC_UNLOCK, 0, eLock, 0);
906    async_mutex_leave(ASYNC_MUTEX_LOCK);
907    async_mutex_leave(ASYNC_MUTEX_QUEUE);
908  }
909  return rc;
910}
911
912/*
913** This function is called when the pager layer first opens a database file
914** and is checking for a hot-journal.
915*/
916static int asyncCheckReservedLock(sqlite3_file *pFile, int *pResOut){
917  int ret = 0;
918  AsyncFileLock *pIter;
919  AsyncFileData *p = ((AsyncFile *)pFile)->pData;
920
921  async_mutex_enter(ASYNC_MUTEX_LOCK);
922  for(pIter=p->pLock->pList; pIter; pIter=pIter->pNext){
923    if( pIter->eLock>=SQLITE_LOCK_RESERVED ){
924      ret = 1;
925      break;
926    }
927  }
928  async_mutex_leave(ASYNC_MUTEX_LOCK);
929
930  ASYNC_TRACE(("CHECK-LOCK %d (%s)\n", ret, p->zName));
931  *pResOut = ret;
932  return SQLITE_OK;
933}
934
935/*
936** sqlite3_file_control() implementation.
937*/
938static int asyncFileControl(sqlite3_file *id, int op, void *pArg){
939  switch( op ){
940    case SQLITE_FCNTL_LOCKSTATE: {
941      async_mutex_enter(ASYNC_MUTEX_LOCK);
942      *(int*)pArg = ((AsyncFile*)id)->pData->lock.eLock;
943      async_mutex_leave(ASYNC_MUTEX_LOCK);
944      return SQLITE_OK;
945    }
946  }
947  return SQLITE_ERROR;
948}
949
950/*
951** Return the device characteristics and sector-size of the device. It
952** is tricky to implement these correctly, as this backend might
953** not have an open file handle at this point.
954*/
955static int asyncSectorSize(sqlite3_file *pFile){
956  UNUSED_PARAMETER(pFile);
957  return 512;
958}
959static int asyncDeviceCharacteristics(sqlite3_file *pFile){
960  UNUSED_PARAMETER(pFile);
961  return 0;
962}
963
964static int unlinkAsyncFile(AsyncFileData *pData){
965  AsyncFileLock **ppIter;
966  int rc = SQLITE_OK;
967
968  if( pData->zName ){
969    AsyncLock *pLock = pData->pLock;
970    for(ppIter=&pLock->pList; *ppIter; ppIter=&((*ppIter)->pNext)){
971      if( (*ppIter)==&pData->lock ){
972        *ppIter = pData->lock.pNext;
973        break;
974      }
975    }
976    if( !pLock->pList ){
977      AsyncLock **pp;
978      if( pLock->pFile ){
979        pLock->pFile->pMethods->xClose(pLock->pFile);
980      }
981      for(pp=&async.pLock; *pp!=pLock; pp=&((*pp)->pNext));
982      *pp = pLock->pNext;
983      sqlite3_free(pLock);
984    }else{
985      rc = getFileLock(pLock);
986    }
987  }
988
989  return rc;
990}
991
992/*
993** The parameter passed to this function is a copy of a 'flags' parameter
994** passed to this modules xOpen() method. This function returns true
995** if the file should be opened asynchronously, or false if it should
996** be opened immediately.
997**
998** If the file is to be opened asynchronously, then asyncOpen() will add
999** an entry to the event queue and the file will not actually be opened
1000** until the event is processed. Otherwise, the file is opened directly
1001** by the caller.
1002*/
1003static int doAsynchronousOpen(int flags){
1004  return (flags&SQLITE_OPEN_CREATE) && (
1005      (flags&SQLITE_OPEN_MAIN_JOURNAL) ||
1006      (flags&SQLITE_OPEN_TEMP_JOURNAL) ||
1007      (flags&SQLITE_OPEN_DELETEONCLOSE)
1008  );
1009}
1010
1011/*
1012** Open a file.
1013*/
1014static int asyncOpen(
1015  sqlite3_vfs *pAsyncVfs,
1016  const char *zName,
1017  sqlite3_file *pFile,
1018  int flags,
1019  int *pOutFlags
1020){
1021  static sqlite3_io_methods async_methods = {
1022    1,                               /* iVersion */
1023    asyncClose,                      /* xClose */
1024    asyncRead,                       /* xRead */
1025    asyncWrite,                      /* xWrite */
1026    asyncTruncate,                   /* xTruncate */
1027    asyncSync,                       /* xSync */
1028    asyncFileSize,                   /* xFileSize */
1029    asyncLock,                       /* xLock */
1030    asyncUnlock,                     /* xUnlock */
1031    asyncCheckReservedLock,          /* xCheckReservedLock */
1032    asyncFileControl,                /* xFileControl */
1033    asyncSectorSize,                 /* xSectorSize */
1034    asyncDeviceCharacteristics       /* xDeviceCharacteristics */
1035  };
1036
1037  sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1038  AsyncFile *p = (AsyncFile *)pFile;
1039  int nName = 0;
1040  int rc = SQLITE_OK;
1041  int nByte;
1042  AsyncFileData *pData;
1043  AsyncLock *pLock = 0;
1044  char *z;
1045  int isAsyncOpen = doAsynchronousOpen(flags);
1046
1047  /* If zName is NULL, then the upper layer is requesting an anonymous file */
1048  if( zName ){
1049    nName = (int)strlen(zName)+1;
1050  }
1051
1052  nByte = (
1053    sizeof(AsyncFileData) +        /* AsyncFileData structure */
1054    2 * pVfs->szOsFile +           /* AsyncFileData.pBaseRead and pBaseWrite */
1055    nName                          /* AsyncFileData.zName */
1056  );
1057  z = sqlite3_malloc(nByte);
1058  if( !z ){
1059    return SQLITE_NOMEM;
1060  }
1061  memset(z, 0, nByte);
1062  pData = (AsyncFileData*)z;
1063  z += sizeof(pData[0]);
1064  pData->pBaseRead = (sqlite3_file*)z;
1065  z += pVfs->szOsFile;
1066  pData->pBaseWrite = (sqlite3_file*)z;
1067  pData->closeOp.pFileData = pData;
1068  pData->closeOp.op = ASYNC_CLOSE;
1069
1070  if( zName ){
1071    z += pVfs->szOsFile;
1072    pData->zName = z;
1073    pData->nName = nName;
1074    memcpy(pData->zName, zName, nName);
1075  }
1076
1077  if( !isAsyncOpen ){
1078    int flagsout;
1079    rc = pVfs->xOpen(pVfs, pData->zName, pData->pBaseRead, flags, &flagsout);
1080    if( rc==SQLITE_OK
1081     && (flagsout&SQLITE_OPEN_READWRITE)
1082     && (flags&SQLITE_OPEN_EXCLUSIVE)==0
1083    ){
1084      rc = pVfs->xOpen(pVfs, pData->zName, pData->pBaseWrite, flags, 0);
1085    }
1086    if( pOutFlags ){
1087      *pOutFlags = flagsout;
1088    }
1089  }
1090
1091  async_mutex_enter(ASYNC_MUTEX_LOCK);
1092
1093  if( zName && rc==SQLITE_OK ){
1094    pLock = findLock(pData->zName, pData->nName);
1095    if( !pLock ){
1096      int nByte = pVfs->szOsFile + sizeof(AsyncLock) + pData->nName + 1;
1097      pLock = (AsyncLock *)sqlite3_malloc(nByte);
1098      if( pLock ){
1099        memset(pLock, 0, nByte);
1100        if( async.bLockFiles && (flags&SQLITE_OPEN_MAIN_DB) ){
1101          pLock->pFile = (sqlite3_file *)&pLock[1];
1102          rc = pVfs->xOpen(pVfs, pData->zName, pLock->pFile, flags, 0);
1103          if( rc!=SQLITE_OK ){
1104            sqlite3_free(pLock);
1105            pLock = 0;
1106          }
1107        }
1108        if( pLock ){
1109          pLock->nFile = pData->nName;
1110          pLock->zFile = &((char *)(&pLock[1]))[pVfs->szOsFile];
1111          memcpy(pLock->zFile, pData->zName, pLock->nFile);
1112          pLock->pNext = async.pLock;
1113          async.pLock = pLock;
1114        }
1115      }else{
1116        rc = SQLITE_NOMEM;
1117      }
1118    }
1119  }
1120
1121  if( rc==SQLITE_OK ){
1122    p->pMethod = &async_methods;
1123    p->pData = pData;
1124
1125    /* Link AsyncFileData.lock into the linked list of
1126    ** AsyncFileLock structures for this file.
1127    */
1128    if( zName ){
1129      pData->lock.pNext = pLock->pList;
1130      pLock->pList = &pData->lock;
1131      pData->zName = pLock->zFile;
1132    }
1133  }else{
1134    if( pData->pBaseRead->pMethods ){
1135      pData->pBaseRead->pMethods->xClose(pData->pBaseRead);
1136    }
1137    if( pData->pBaseWrite->pMethods ){
1138      pData->pBaseWrite->pMethods->xClose(pData->pBaseWrite);
1139    }
1140    sqlite3_free(pData);
1141  }
1142
1143  async_mutex_leave(ASYNC_MUTEX_LOCK);
1144
1145  if( rc==SQLITE_OK ){
1146    pData->pLock = pLock;
1147  }
1148
1149  if( rc==SQLITE_OK && isAsyncOpen ){
1150    rc = addNewAsyncWrite(pData, ASYNC_OPENEXCLUSIVE, (sqlite3_int64)flags,0,0);
1151    if( rc==SQLITE_OK ){
1152      if( pOutFlags ) *pOutFlags = flags;
1153    }else{
1154      async_mutex_enter(ASYNC_MUTEX_LOCK);
1155      unlinkAsyncFile(pData);
1156      async_mutex_leave(ASYNC_MUTEX_LOCK);
1157      sqlite3_free(pData);
1158    }
1159  }
1160  if( rc!=SQLITE_OK ){
1161    p->pMethod = 0;
1162  }else{
1163    incrOpenFileCount();
1164  }
1165
1166  return rc;
1167}
1168
1169/*
1170** Implementation of sqlite3OsDelete. Add an entry to the end of the
1171** write-op queue to perform the delete.
1172*/
1173static int asyncDelete(sqlite3_vfs *pAsyncVfs, const char *z, int syncDir){
1174  UNUSED_PARAMETER(pAsyncVfs);
1175  return addNewAsyncWrite(0, ASYNC_DELETE, syncDir, (int)strlen(z)+1, z);
1176}
1177
1178/*
1179** Implementation of sqlite3OsAccess. This method holds the mutex from
1180** start to finish.
1181*/
1182static int asyncAccess(
1183  sqlite3_vfs *pAsyncVfs,
1184  const char *zName,
1185  int flags,
1186  int *pResOut
1187){
1188  int rc;
1189  int ret;
1190  AsyncWrite *p;
1191  sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1192
1193  assert(flags==SQLITE_ACCESS_READWRITE
1194      || flags==SQLITE_ACCESS_READ
1195      || flags==SQLITE_ACCESS_EXISTS
1196  );
1197
1198  async_mutex_enter(ASYNC_MUTEX_QUEUE);
1199  rc = pVfs->xAccess(pVfs, zName, flags, &ret);
1200  if( rc==SQLITE_OK && flags==SQLITE_ACCESS_EXISTS ){
1201    for(p=async.pQueueFirst; p; p = p->pNext){
1202      if( p->op==ASYNC_DELETE && 0==strcmp(p->zBuf, zName) ){
1203        ret = 0;
1204      }else if( p->op==ASYNC_OPENEXCLUSIVE
1205             && p->pFileData->zName
1206             && 0==strcmp(p->pFileData->zName, zName)
1207      ){
1208        ret = 1;
1209      }
1210    }
1211  }
1212  ASYNC_TRACE(("ACCESS(%s): %s = %d\n",
1213    flags==SQLITE_ACCESS_READWRITE?"read-write":
1214    flags==SQLITE_ACCESS_READ?"read":"exists"
1215    , zName, ret)
1216  );
1217  async_mutex_leave(ASYNC_MUTEX_QUEUE);
1218  *pResOut = ret;
1219  return rc;
1220}
1221
1222/*
1223** Fill in zPathOut with the full path to the file identified by zPath.
1224*/
1225static int asyncFullPathname(
1226  sqlite3_vfs *pAsyncVfs,
1227  const char *zPath,
1228  int nPathOut,
1229  char *zPathOut
1230){
1231  int rc;
1232  sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1233  rc = pVfs->xFullPathname(pVfs, zPath, nPathOut, zPathOut);
1234
1235  /* Because of the way intra-process file locking works, this backend
1236  ** needs to return a canonical path. The following block assumes the
1237  ** file-system uses unix style paths.
1238  */
1239  if( rc==SQLITE_OK ){
1240    int i, j;
1241    char *z = zPathOut;
1242    int n = (int)strlen(z);
1243    while( n>1 && z[n-1]=='/' ){ n--; }
1244    for(i=j=0; i<n; i++){
1245      if( z[i]=='/' ){
1246        if( z[i+1]=='/' ) continue;
1247        if( z[i+1]=='.' && i+2<n && z[i+2]=='/' ){
1248          i += 1;
1249          continue;
1250        }
1251        if( z[i+1]=='.' && i+3<n && z[i+2]=='.' && z[i+3]=='/' ){
1252          while( j>0 && z[j-1]!='/' ){ j--; }
1253          if( j>0 ){ j--; }
1254          i += 2;
1255          continue;
1256        }
1257      }
1258      z[j++] = z[i];
1259    }
1260    z[j] = 0;
1261  }
1262
1263  return rc;
1264}
1265static void *asyncDlOpen(sqlite3_vfs *pAsyncVfs, const char *zPath){
1266  sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1267  return pVfs->xDlOpen(pVfs, zPath);
1268}
1269static void asyncDlError(sqlite3_vfs *pAsyncVfs, int nByte, char *zErrMsg){
1270  sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1271  pVfs->xDlError(pVfs, nByte, zErrMsg);
1272}
1273static void (*asyncDlSym(
1274  sqlite3_vfs *pAsyncVfs,
1275  void *pHandle,
1276  const char *zSymbol
1277))(void){
1278  sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1279  return pVfs->xDlSym(pVfs, pHandle, zSymbol);
1280}
1281static void asyncDlClose(sqlite3_vfs *pAsyncVfs, void *pHandle){
1282  sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1283  pVfs->xDlClose(pVfs, pHandle);
1284}
1285static int asyncRandomness(sqlite3_vfs *pAsyncVfs, int nByte, char *zBufOut){
1286  sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1287  return pVfs->xRandomness(pVfs, nByte, zBufOut);
1288}
1289static int asyncSleep(sqlite3_vfs *pAsyncVfs, int nMicro){
1290  sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1291  return pVfs->xSleep(pVfs, nMicro);
1292}
1293static int asyncCurrentTime(sqlite3_vfs *pAsyncVfs, double *pTimeOut){
1294  sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1295  return pVfs->xCurrentTime(pVfs, pTimeOut);
1296}
1297
/*
** The VFS object for the asynchronous backend. The mxPathname and
** pAppData fields start out as 0 and are filled in by
** sqlite3async_initialize(). pAppData holds the parent VFS while the
** backend is installed and is reset to 0 by sqlite3async_shutdown().
*/
static sqlite3_vfs async_vfs = {
  1,                    /* iVersion */
  sizeof(AsyncFile),    /* szOsFile */
  0,                    /* mxPathname */
  0,                    /* pNext */
  SQLITEASYNC_VFSNAME,  /* zName */
  0,                    /* pAppData */
  asyncOpen,            /* xOpen */
  asyncDelete,          /* xDelete */
  asyncAccess,          /* xAccess */
  asyncFullPathname,    /* xFullPathname */
  asyncDlOpen,          /* xDlOpen */
  asyncDlError,         /* xDlError */
  asyncDlSym,           /* xDlSym */
  asyncDlClose,         /* xDlClose */
  asyncRandomness,      /* xRandomness */
  asyncSleep,           /* xSleep */
  asyncCurrentTime      /* xCurrentTime */
};
1317
1318/*
1319** This procedure runs in a separate thread, reading messages off of the
1320** write queue and processing them one by one.
1321**
1322** If async.writerHaltNow is true, then this procedure exits
1323** after processing a single message.
1324**
1325** If async.writerHaltWhenIdle is true, then this procedure exits when
1326** the write queue is empty.
1327**
1328** If both of the above variables are false, this procedure runs
1329** indefinately, waiting for operations to be added to the write queue
1330** and processing them in the order in which they arrive.
1331**
1332** An artifical delay of async.ioDelay milliseconds is inserted before
1333** each write operation in order to simulate the effect of a slow disk.
1334**
1335** Only one instance of this procedure may be running at a time.
1336*/
1337static void asyncWriterThread(void){
1338  sqlite3_vfs *pVfs = (sqlite3_vfs *)(async_vfs.pAppData);
1339  AsyncWrite *p = 0;
1340  int rc = SQLITE_OK;
1341  int holdingMutex = 0;
1342
1343  async_mutex_enter(ASYNC_MUTEX_WRITER);
1344
1345  while( async.eHalt!=SQLITEASYNC_HALT_NOW ){
1346    int doNotFree = 0;
1347    sqlite3_file *pBase = 0;
1348
1349    if( !holdingMutex ){
1350      async_mutex_enter(ASYNC_MUTEX_QUEUE);
1351    }
1352    while( (p = async.pQueueFirst)==0 ){
1353      if( async.eHalt!=SQLITEASYNC_HALT_NEVER ){
1354        async_mutex_leave(ASYNC_MUTEX_QUEUE);
1355        break;
1356      }else{
1357        ASYNC_TRACE(("IDLE\n"));
1358        async_cond_wait(ASYNC_COND_QUEUE, ASYNC_MUTEX_QUEUE);
1359        ASYNC_TRACE(("WAKEUP\n"));
1360      }
1361    }
1362    if( p==0 ) break;
1363    holdingMutex = 1;
1364
1365    /* Right now this thread is holding the mutex on the write-op queue.
1366    ** Variable 'p' points to the first entry in the write-op queue. In
1367    ** the general case, we hold on to the mutex for the entire body of
1368    ** the loop.
1369    **
1370    ** However in the cases enumerated below, we relinquish the mutex,
1371    ** perform the IO, and then re-request the mutex before removing 'p' from
1372    ** the head of the write-op queue. The idea is to increase concurrency with
1373    ** sqlite threads.
1374    **
1375    **     * An ASYNC_CLOSE operation.
1376    **     * An ASYNC_OPENEXCLUSIVE operation. For this one, we relinquish
1377    **       the mutex, call the underlying xOpenExclusive() function, then
1378    **       re-aquire the mutex before seting the AsyncFile.pBaseRead
1379    **       variable.
1380    **     * ASYNC_SYNC and ASYNC_WRITE operations, if
1381    **       SQLITE_ASYNC_TWO_FILEHANDLES was set at compile time and two
1382    **       file-handles are open for the particular file being "synced".
1383    */
1384    if( async.ioError!=SQLITE_OK && p->op!=ASYNC_CLOSE ){
1385      p->op = ASYNC_NOOP;
1386    }
1387    if( p->pFileData ){
1388      pBase = p->pFileData->pBaseWrite;
1389      if(
1390        p->op==ASYNC_CLOSE ||
1391        p->op==ASYNC_OPENEXCLUSIVE ||
1392        (pBase->pMethods && (p->op==ASYNC_SYNC || p->op==ASYNC_WRITE) )
1393      ){
1394        async_mutex_leave(ASYNC_MUTEX_QUEUE);
1395        holdingMutex = 0;
1396      }
1397      if( !pBase->pMethods ){
1398        pBase = p->pFileData->pBaseRead;
1399      }
1400    }
1401
1402    switch( p->op ){
1403      case ASYNC_NOOP:
1404        break;
1405
1406      case ASYNC_WRITE:
1407        assert( pBase );
1408        ASYNC_TRACE(("WRITE %s %d bytes at %d\n",
1409                p->pFileData->zName, p->nByte, p->iOffset));
1410        rc = pBase->pMethods->xWrite(pBase, (void *)(p->zBuf), p->nByte, p->iOffset);
1411        break;
1412
1413      case ASYNC_SYNC:
1414        assert( pBase );
1415        ASYNC_TRACE(("SYNC %s\n", p->pFileData->zName));
1416        rc = pBase->pMethods->xSync(pBase, p->nByte);
1417        break;
1418
1419      case ASYNC_TRUNCATE:
1420        assert( pBase );
1421        ASYNC_TRACE(("TRUNCATE %s to %d bytes\n",
1422                p->pFileData->zName, p->iOffset));
1423        rc = pBase->pMethods->xTruncate(pBase, p->iOffset);
1424        break;
1425
1426      case ASYNC_CLOSE: {
1427        AsyncFileData *pData = p->pFileData;
1428        ASYNC_TRACE(("CLOSE %s\n", p->pFileData->zName));
1429        if( pData->pBaseWrite->pMethods ){
1430          pData->pBaseWrite->pMethods->xClose(pData->pBaseWrite);
1431        }
1432        if( pData->pBaseRead->pMethods ){
1433          pData->pBaseRead->pMethods->xClose(pData->pBaseRead);
1434        }
1435
1436        /* Unlink AsyncFileData.lock from the linked list of AsyncFileLock
1437        ** structures for this file. Obtain the async.lockMutex mutex
1438        ** before doing so.
1439        */
1440        async_mutex_enter(ASYNC_MUTEX_LOCK);
1441        rc = unlinkAsyncFile(pData);
1442        async_mutex_leave(ASYNC_MUTEX_LOCK);
1443
1444        if( !holdingMutex ){
1445          async_mutex_enter(ASYNC_MUTEX_QUEUE);
1446          holdingMutex = 1;
1447        }
1448        assert_mutex_is_held(ASYNC_MUTEX_QUEUE);
1449        async.pQueueFirst = p->pNext;
1450        sqlite3_free(pData);
1451        doNotFree = 1;
1452        break;
1453      }
1454
1455      case ASYNC_UNLOCK: {
1456        AsyncWrite *pIter;
1457        AsyncFileData *pData = p->pFileData;
1458        int eLock = p->nByte;
1459
1460        /* When a file is locked by SQLite using the async backend, it is
1461        ** locked within the 'real' file-system synchronously. When it is
1462        ** unlocked, an ASYNC_UNLOCK event is added to the write-queue to
1463        ** unlock the file asynchronously. The design of the async backend
1464        ** requires that the 'real' file-system file be locked from the
1465        ** time that SQLite first locks it (and probably reads from it)
1466        ** until all asynchronous write events that were scheduled before
1467        ** SQLite unlocked the file have been processed.
1468        **
1469        ** This is more complex if SQLite locks and unlocks the file multiple
1470        ** times in quick succession. For example, if SQLite does:
1471        **
1472        **   lock, write, unlock, lock, write, unlock
1473        **
1474        ** Each "lock" operation locks the file immediately. Each "write"
1475        ** and "unlock" operation adds an event to the event queue. If the
1476        ** second "lock" operation is performed before the first "unlock"
1477        ** operation has been processed asynchronously, then the first
1478        ** "unlock" cannot be safely processed as is, since this would mean
1479        ** the file was unlocked when the second "write" operation is
1480        ** processed. To work around this, when processing an ASYNC_UNLOCK
1481        ** operation, SQLite:
1482        **
1483        **   1) Unlocks the file to the minimum of the argument passed to
1484        **      the xUnlock() call and the current lock from SQLite's point
1485        **      of view, and
1486        **
1487        **   2) Only unlocks the file at all if this event is the last
1488        **      ASYNC_UNLOCK event on this file in the write-queue.
1489        */
1490        assert( holdingMutex==1 );
1491        assert( async.pQueueFirst==p );
1492        for(pIter=async.pQueueFirst->pNext; pIter; pIter=pIter->pNext){
1493          if( pIter->pFileData==pData && pIter->op==ASYNC_UNLOCK ) break;
1494        }
1495        if( !pIter ){
1496          async_mutex_enter(ASYNC_MUTEX_LOCK);
1497          pData->lock.eAsyncLock = MIN(
1498              pData->lock.eAsyncLock, MAX(pData->lock.eLock, eLock)
1499          );
1500          assert(pData->lock.eAsyncLock>=pData->lock.eLock);
1501          rc = getFileLock(pData->pLock);
1502          async_mutex_leave(ASYNC_MUTEX_LOCK);
1503        }
1504        break;
1505      }
1506
1507      case ASYNC_DELETE:
1508        ASYNC_TRACE(("DELETE %s\n", p->zBuf));
1509        rc = pVfs->xDelete(pVfs, p->zBuf, (int)p->iOffset);
1510        break;
1511
1512      case ASYNC_OPENEXCLUSIVE: {
1513        int flags = (int)p->iOffset;
1514        AsyncFileData *pData = p->pFileData;
1515        ASYNC_TRACE(("OPEN %s flags=%d\n", p->zBuf, (int)p->iOffset));
1516        assert(pData->pBaseRead->pMethods==0 && pData->pBaseWrite->pMethods==0);
1517        rc = pVfs->xOpen(pVfs, pData->zName, pData->pBaseRead, flags, 0);
1518        assert( holdingMutex==0 );
1519        async_mutex_enter(ASYNC_MUTEX_QUEUE);
1520        holdingMutex = 1;
1521        break;
1522      }
1523
1524      default: assert(!"Illegal value for AsyncWrite.op");
1525    }
1526
1527    /* If we didn't hang on to the mutex during the IO op, obtain it now
1528    ** so that the AsyncWrite structure can be safely removed from the
1529    ** global write-op queue.
1530    */
1531    if( !holdingMutex ){
1532      async_mutex_enter(ASYNC_MUTEX_QUEUE);
1533      holdingMutex = 1;
1534    }
1535    /* ASYNC_TRACE(("UNLINK %p\n", p)); */
1536    if( p==async.pQueueLast ){
1537      async.pQueueLast = 0;
1538    }
1539    if( !doNotFree ){
1540      assert_mutex_is_held(ASYNC_MUTEX_QUEUE);
1541      async.pQueueFirst = p->pNext;
1542      sqlite3_free(p);
1543    }
1544    assert( holdingMutex );
1545
1546    /* An IO error has occurred. We cannot report the error back to the
1547    ** connection that requested the I/O since the error happened
1548    ** asynchronously.  The connection has already moved on.  There
1549    ** really is nobody to report the error to.
1550    **
1551    ** The file for which the error occurred may have been a database or
1552    ** journal file. Regardless, none of the currently queued operations
1553    ** associated with the same database should now be performed. Nor should
1554    ** any subsequently requested IO on either a database or journal file
1555    ** handle for the same database be accepted until the main database
1556    ** file handle has been closed and reopened.
1557    **
1558    ** Furthermore, no further IO should be queued or performed on any file
1559    ** handle associated with a database that may have been part of a
1560    ** multi-file transaction that included the database associated with
1561    ** the IO error (i.e. a database ATTACHed to the same handle at some
1562    ** point in time).
1563    */
1564    if( rc!=SQLITE_OK ){
1565      async.ioError = rc;
1566    }
1567
1568    if( async.ioError && !async.pQueueFirst ){
1569      async_mutex_enter(ASYNC_MUTEX_LOCK);
1570      if( 0==async.pLock ){
1571        async.ioError = SQLITE_OK;
1572      }
1573      async_mutex_leave(ASYNC_MUTEX_LOCK);
1574    }
1575
1576    /* Drop the queue mutex before continuing to the next write operation
1577    ** in order to give other threads a chance to work with the write queue.
1578    */
1579    if( !async.pQueueFirst || !async.ioError ){
1580      async_mutex_leave(ASYNC_MUTEX_QUEUE);
1581      holdingMutex = 0;
1582      if( async.ioDelay>0 ){
1583        pVfs->xSleep(pVfs, async.ioDelay*1000);
1584      }else{
1585        async_sched_yield();
1586      }
1587    }
1588  }
1589
1590  async_mutex_leave(ASYNC_MUTEX_WRITER);
1591  return;
1592}
1593
1594/*
1595** Install the asynchronous VFS.
1596*/
1597int sqlite3async_initialize(const char *zParent, int isDefault){
1598  int rc = SQLITE_OK;
1599  if( async_vfs.pAppData==0 ){
1600    sqlite3_vfs *pParent = sqlite3_vfs_find(zParent);
1601    if( !pParent || async_os_initialize() ){
1602      rc = SQLITE_ERROR;
1603    }else if( SQLITE_OK!=(rc = sqlite3_vfs_register(&async_vfs, isDefault)) ){
1604      async_os_shutdown();
1605    }else{
1606      async_vfs.pAppData = (void *)pParent;
1607      async_vfs.mxPathname = ((sqlite3_vfs *)async_vfs.pAppData)->mxPathname;
1608    }
1609  }
1610  return rc;
1611}
1612
1613/*
1614** Uninstall the asynchronous VFS.
1615*/
1616void sqlite3async_shutdown(void){
1617  if( async_vfs.pAppData ){
1618    async_os_shutdown();
1619    sqlite3_vfs_unregister((sqlite3_vfs *)&async_vfs);
1620    async_vfs.pAppData = 0;
1621  }
1622}
1623
/*
** Process events on the write-queue by running asyncWriterThread() in
** the calling thread. This call returns when asyncWriterThread() returns.
*/
void sqlite3async_run(void){
  asyncWriterThread();
}
1630
1631/*
1632** Control/configure the asynchronous IO system.
1633*/
1634int sqlite3async_control(int op, ...){
1635  va_list ap;
1636  va_start(ap, op);
1637  switch( op ){
1638    case SQLITEASYNC_HALT: {
1639      int eWhen = va_arg(ap, int);
1640      if( eWhen!=SQLITEASYNC_HALT_NEVER
1641       && eWhen!=SQLITEASYNC_HALT_NOW
1642       && eWhen!=SQLITEASYNC_HALT_IDLE
1643      ){
1644        return SQLITE_MISUSE;
1645      }
1646      async.eHalt = eWhen;
1647      async_mutex_enter(ASYNC_MUTEX_QUEUE);
1648      async_cond_signal(ASYNC_COND_QUEUE);
1649      async_mutex_leave(ASYNC_MUTEX_QUEUE);
1650      break;
1651    }
1652
1653    case SQLITEASYNC_DELAY: {
1654      int iDelay = va_arg(ap, int);
1655      if( iDelay<0 ){
1656        return SQLITE_MISUSE;
1657      }
1658      async.ioDelay = iDelay;
1659      break;
1660    }
1661
1662    case SQLITEASYNC_LOCKFILES: {
1663      int bLock = va_arg(ap, int);
1664      async_mutex_enter(ASYNC_MUTEX_QUEUE);
1665      if( async.nFile || async.pQueueFirst ){
1666        async_mutex_leave(ASYNC_MUTEX_QUEUE);
1667        return SQLITE_MISUSE;
1668      }
1669      async.bLockFiles = bLock;
1670      async_mutex_leave(ASYNC_MUTEX_QUEUE);
1671      break;
1672    }
1673
1674    case SQLITEASYNC_GET_HALT: {
1675      int *peWhen = va_arg(ap, int *);
1676      *peWhen = async.eHalt;
1677      break;
1678    }
1679    case SQLITEASYNC_GET_DELAY: {
1680      int *piDelay = va_arg(ap, int *);
1681      *piDelay = async.ioDelay;
1682      break;
1683    }
1684    case SQLITEASYNC_GET_LOCKFILES: {
1685      int *piDelay = va_arg(ap, int *);
1686      *piDelay = async.bLockFiles;
1687      break;
1688    }
1689
1690    default:
1691      return SQLITE_ERROR;
1692  }
1693  return SQLITE_OK;
1694}
1695
1696#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_ASYNCIO) */
1697
1698