diff --git a/fil/fil0fil.c b/fil/fil0fil.c
index 9ed6b62f142..107a81b85a8 100644
--- a/fil/fil0fil.c
+++ b/fil/fil0fil.c
@@ -4436,11 +4436,14 @@ fil_aio_wait(
 	ut_ad(fil_validate());
 
-	if (os_aio_use_native_aio) {
+	if (srv_use_native_aio) {
 		srv_set_io_thread_op_info(segment, "native aio handle");
 #ifdef WIN_ASYNC_IO
 		ret = os_aio_windows_handle(segment, 0, &fil_node,
 					    &message, &type);
+#elif defined(LINUX_NATIVE_AIO)
+		ret = os_aio_linux_handle(segment, &fil_node,
+					  &message, &type);
 #else
 		ret = 0; /* Eliminate compiler warning */
 		ut_error;
diff --git a/handler/ha_innodb.cc b/handler/ha_innodb.cc
index 35bc204dddf..c9eb5e99d8b 100644
--- a/handler/ha_innodb.cc
+++ b/handler/ha_innodb.cc
@@ -9573,6 +9573,11 @@ static MYSQL_SYSVAR_STR(version, innodb_version_str,
   PLUGIN_VAR_NOCMDOPT | PLUGIN_VAR_READONLY,
   "InnoDB version", NULL, NULL, INNODB_VERSION_STR);
 
+static MYSQL_SYSVAR_BOOL(use_native_aio, srv_use_native_aio,
+  PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+  "Use native AIO if supported on this platform.",
+  NULL, NULL, TRUE);
+
 static struct st_mysql_sys_var* innobase_system_variables[]= {
   MYSQL_SYSVAR(additional_mem_pool_size),
   MYSQL_SYSVAR(autoextend_increment),
@@ -9619,6 +9624,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
   MYSQL_SYSVAR(thread_sleep_delay),
   MYSQL_SYSVAR(autoinc_lock_mode),
   MYSQL_SYSVAR(version),
+  MYSQL_SYSVAR(use_native_aio),
   NULL
 };
diff --git a/include/os0file.h b/include/os0file.h
index 67d31dd04e9..339945752df 100644
--- a/include/os0file.h
+++ b/include/os0file.h
@@ -51,12 +51,6 @@ typedef int	os_file_t;
 
 extern ulint	os_innodb_umask;
 
-/* If this flag is TRUE, then we will use the native aio of the
-OS (provided we compiled Innobase with it in), otherwise we will
-use simulated aio we build below with threads */
-
-extern ibool	os_aio_use_native_aio;
-
 #define OS_FILE_SECTOR_SIZE	512
 
 /* The next value should be smaller or equal to the smallest sector size used
@@ -98,6 +92,7 @@ log. */
 					to become available again */
 #define	OS_FILE_SHARING_VIOLATION	76
 #define	OS_FILE_ERROR_NOT_SPECIFIED	77
+#define	OS_FILE_AIO_INTERRUPTED		78
 
 /* Types for aio operations */
 #define OS_FILE_READ	10
@@ -556,9 +551,10 @@ in the three first aio arrays is the parameter n_segments given to the
 function. The caller must create an i/o handler thread for each segment in
 the four first arrays, but not for the sync aio array. */
 UNIV_INTERN
-void
+ibool
 os_aio_init(
 /*========*/
+			/* out: TRUE on success. */
 	ulint	n,		/* in: maximum number of pending aio operations
 				allowed; n must be divisible by n_segments */
 	ulint	n_segments,	/* in: combined number of segments in the four
@@ -737,4 +733,32 @@ innobase_mysql_tmpfile(void);
 			/* out: temporary file descriptor, or < 0 on error */
#endif /* !UNIV_HOTBACKUP && !__NETWARE__ */
 
+
+#if defined(LINUX_NATIVE_AIO)
+/**************************************************************************
+This function is only used in Linux native asynchronous i/o.
+Waits for an aio operation to complete. This function is used to wait for
+the completed requests. The aio array of pending requests is divided
+into segments. The thread specifies which segment or slot it wants to wait
+for. NOTE: this function will also take care of freeing the aio slot,
+therefore no other thread is allowed to do the freeing! */
+UNIV_INTERN
+ibool
+os_aio_linux_handle(
+/*================*/
+				/* out: TRUE if the IO was successful */
+	ulint	global_seg,	/* in: segment number in the aio array
+				to wait for; segment 0 is the ibuf
+				i/o thread, segment 1 is log i/o thread,
+				then follow the non-ibuf read threads,
+				and the last are the non-ibuf write
+				threads. */
+	fil_node_t**message1,	/* out: the messages passed with the */
+	void**	message2,	/* aio request; note that in case the
+				aio operation failed, these output
+				parameters are valid and can be used to
+				restart the operation. */
+	ulint*	type);		/* out: OS_FILE_WRITE or ..._READ */
+#endif /* LINUX_NATIVE_AIO */
+
 #endif
diff --git a/include/srv0srv.h b/include/srv0srv.h
index 878afa0feb3..ff9caefd989 100644
--- a/include/srv0srv.h
+++ b/include/srv0srv.h
@@ -68,6 +68,11 @@ extern ulint	srv_check_file_format_at_startup;
 on duplicate key checking and foreign key checking */
 extern ibool	srv_locks_unsafe_for_binlog;
 
+/* If this flag is TRUE, then we will use the native aio of the
+OS (provided we compiled Innobase with it in), otherwise we will
+use the simulated aio we build with threads.
+Currently we support native aio on windows and linux */
+extern my_bool	srv_use_native_aio;
 extern ulint	srv_n_data_files;
 extern char**	srv_data_file_names;
 extern ulint*	srv_data_file_sizes;
diff --git a/include/univ.i b/include/univ.i
index d2fee9c9832..f879b235c2c 100644
--- a/include/univ.i
+++ b/include/univ.i
@@ -162,6 +162,9 @@ operations (very slow); also UNIV_DEBUG must be defined */
 						for compressed pages */
 #define UNIV_ZIP_COPY				/* call page_zip_copy_recs()
 						more often */
+#define UNIV_AIO_DEBUG				/* prints info about
+						submitted and reaped AIO
+						requests to the log. */
 #endif
 
 #define UNIV_BTR_DEBUG	/* check B-tree links */
diff --git a/os/os0file.c b/os/os0file.c
index 9eef834edf7..890fa7f36a6 100644
--- a/os/os0file.c
+++ b/os/os0file.c
@@ -22,6 +22,10 @@ Created 10/21/1995 Heikki Tuuri
 #include 
 #endif /* UNIV_HOTBACKUP */
 
+#if defined(LINUX_NATIVE_AIO)
+#include <libaio.h>
+#endif
+
 /* This specifies the file permissions InnoDB uses when it creates files in
 Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
 my_umask */
@@ -49,11 +53,59 @@ UNIV_INTERN os_mutex_t	os_file_seek_mutexes[OS_FILE_N_SEEK_MUTEXES];
 /* In simulated aio, merge at most this many consecutive i/os */
 #define OS_AIO_MERGE_N_CONSECUTIVE	64
 
-/* If this flag is TRUE, then we will use the native aio of the
-OS (provided we compiled Innobase with it in), otherwise we will
-use simulated aio we build below with threads */
+/**********************************************************************
 
-UNIV_INTERN ibool	os_aio_use_native_aio	= FALSE;
+InnoDB AIO Implementation:
+=========================
+
+We support native AIO for Windows and Linux. For the rest of the
+platforms we simulate AIO using special io-threads that service the
+IO-requests.
+
+Simulated AIO:
+==============
+
+On platforms where we 'simulate' AIO, the following is a rough
+explanation of the high-level design.
+There are four io-threads (for ibuf, log, read, write).
+All synchronous IO requests are serviced by the calling thread using
+os_file_write/os_file_read. The asynchronous requests are queued up
+in an array (there are four such arrays) by the calling thread.
+Later these requests are picked up by the io-thread and are serviced
+synchronously.
+
+Windows native AIO:
+==================
+
+If srv_use_native_aio is not set then Windows follows the same
+code path as simulated AIO. If the flag is set then the native AIO
+interface is used.
+On Windows, one of the limitations is that if a file is opened
+for AIO no synchronous IO can be done on it. Therefore we have an
+extra fifth array to queue up synchronous IO requests.
+There are innodb_file_io_threads helper threads. These threads work
+on the four arrays mentioned above in Simulated AIO. No thread is
+required for the sync array.
+If a synchronous IO request is made, it is first queued in the sync
+array. Then the calling thread itself waits on the request, thus
+making the call synchronous.
+If an AIO request is made the calling thread not only queues it in the
+array but also submits the request. The helper thread then collects
+the completed IO request and calls the completion routine on it.
+
+Linux native AIO:
+=================
+
+If we have libaio installed on the system and innodb_use_native_aio
+is set to TRUE we follow the code path of native AIO, otherwise we
+do simulated AIO.
+There are innodb_file_io_threads helper threads. These threads work
+on the four arrays mentioned above in Simulated AIO.
+If a synchronous IO request is made, it is handled by calling
+os_file_write/os_file_read.
+If an AIO request is made the calling thread not only queues it in the
+array but also submits the request. The helper thread then collects
+the completed IO request and calls the completion routine on it.
+
+**********************************************************************/
 
 UNIV_INTERN ibool	os_aio_print_debug	= FALSE;
 
@@ -90,6 +142,10 @@ struct os_aio_slot_struct{
 					OVERLAPPED struct */
 	OVERLAPPED	control;	/* Windows control block for the
 					aio request */
+#elif defined(LINUX_NATIVE_AIO)
+	struct iocb	control;	/* Linux control block for aio */
+	int		n_bytes;	/* bytes written/read. */
+	int		ret;		/* AIO return code */
 #endif
 };
 
@@ -109,6 +165,10 @@ struct os_aio_array_struct{
 	ulint		n_segments;/* Number of segments in the aio array of
 				pending aio requests. A thread can wait
 				separately for any one of the segments. */
+	ulint		cur_seg;/* We reserve IO requests in round-robin
+				fashion to different segments. This points
+				to the segment that is to be used to service
+				the next IO request. */
 	ulint		n_reserved;/* Number of reserved slots in the
 				aio array outside the ibuf segment */
 	os_aio_slot_t*	slots;	/* Pointer to the slots in the array */
@@ -120,8 +180,31 @@ struct os_aio_array_struct{
 				in WaitForMultipleObjects; used only in
 				Windows */
 #endif
+
+#if defined(LINUX_NATIVE_AIO)
+	io_context_t*	aio_ctx;
+				/* completion queue for IO. There is
+				one such queue per segment. Each thread
+				will work on one ctx exclusively. */
+	struct io_event*	aio_events;
+				/* The array to collect completed IOs.
+				There is one such event for each
+				possible pending IO. The size of the
+				array is equal to n_slots. */
+#endif
 };
 
+#if defined(LINUX_NATIVE_AIO)
+/* timeout for each io_getevents() call = 500ms. */
+#define OS_AIO_REAP_TIMEOUT	(500000000UL)
+
+/* time to sleep, in microseconds, if io_setup() returns EAGAIN. */
+#define OS_AIO_IO_SETUP_RETRY_SLEEP	(500000UL)
+
+/* number of attempts before giving up on io_setup(). */
+#define OS_AIO_IO_SETUP_RETRY_ATTEMPTS	5
+#endif
+
 /* Array of events used in simulated aio */
 static os_event_t*	os_aio_segment_wait_events = NULL;
 
@@ -133,6 +216,7 @@ static os_aio_array_t*	os_aio_ibuf_array	= NULL;
 static os_aio_array_t*	os_aio_log_array	= NULL;
 static os_aio_array_t*	os_aio_sync_array	= NULL;
 
+/* Total number of segments. */
 static ulint	os_aio_n_segments	= ULINT_UNDEFINED;
 
 /* If the following is TRUE, read i/o handler threads try to
@@ -320,17 +404,29 @@ os_file_get_last_error(
 
 		fflush(stderr);
 
-		if (err == ENOSPC) {
+		switch (err) {
+		case ENOSPC:
 			return(OS_FILE_DISK_FULL);
-		} else if (err == ENOENT) {
+		case ENOENT:
 			return(OS_FILE_NOT_FOUND);
-		} else if (err == EEXIST) {
+		case EEXIST:
 			return(OS_FILE_ALREADY_EXISTS);
-		} else if (err == EXDEV || err == ENOTDIR || err == EISDIR) {
+		case EXDEV:
+		case ENOTDIR:
+		case EISDIR:
 			return(OS_FILE_PATH_ERROR);
-		} else {
-			return(100 + err);
+		case EAGAIN:
+			if (srv_use_native_aio) {
+				return(OS_FILE_AIO_RESOURCES_RESERVED);
+			}
+			break;
+		case EINTR:
+			if (srv_use_native_aio) {
+				return(OS_FILE_AIO_INTERRUPTED);
+			}
+			break;
 		}
+
+		return(100 + err);
 #endif
 }
 
@@ -380,6 +476,9 @@ os_file_handle_error_cond_exit(
 
 		return(FALSE);
 	} else if (err == OS_FILE_AIO_RESOURCES_RESERVED) {
 
+		return(TRUE);
+	} else if (err == OS_FILE_AIO_INTERRUPTED) {
+
 		return(TRUE);
 	} else if (err == OS_FILE_ALREADY_EXISTS
 		   || err == OS_FILE_PATH_ERROR) {
@@ -1188,7 +1287,7 @@ os_file_create(
 		buffering of writes in the OS */
 		attributes = 0;
 #ifdef WIN_ASYNC_IO
-		if (os_aio_use_native_aio) {
+		if (srv_use_native_aio) {
 			attributes = attributes | FILE_FLAG_OVERLAPPED;
 		}
 #endif
@@ -2851,13 +2950,103 @@ os_aio_array_get_nth_slot(
 
 	return((array->slots) + index);
 }
 
-/****************************************************************************
-Creates an aio wait array. */
+#if defined(LINUX_NATIVE_AIO)
+/**********************************************************************
+Creates an io_context for native linux AIO. */
+static
+ibool
+os_aio_linux_create_io_ctx(
+/*=======================*/
+				/* out: TRUE on success. */
+	ulint		max_events,	/* in: number of events. */
+	io_context_t*	io_ctx)		/* out: io_ctx to initialize. */
+{
+	int	ret;
+	ulint	retries = 0;
+
+retry:
+	memset(io_ctx, 0x0, sizeof(*io_ctx));
+
+	/* Initialize the io_ctx. Tell it how many pending
+	IO requests this context will handle. */
+
+	ret = io_setup(max_events, io_ctx);
+	if (ret == 0) {
+#if defined(UNIV_AIO_DEBUG)
+		fprintf(stderr,
+			"InnoDB: Linux native AIO:"
+			" initialized io_ctx for segment\n");
+#endif
+		/* Success. Return now. */
+		return(TRUE);
+	}
+
+	/* If we hit EAGAIN we'll make a few attempts before failing. */
+
+	switch (ret) {
+	case -EAGAIN:
+		if (retries == 0) {
+			/* First time around. */
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				" InnoDB: Warning: io_setup() failed"
+				" with EAGAIN. Will make %d attempts"
+				" before giving up.\n",
+				OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
+		}
+
+		if (retries < OS_AIO_IO_SETUP_RETRY_ATTEMPTS) {
+			++retries;
+			fprintf(stderr,
+				"InnoDB: Warning: io_setup() attempt"
+				" %lu failed.\n",
+				retries);
+			os_thread_sleep(OS_AIO_IO_SETUP_RETRY_SLEEP);
+			goto retry;
+		}
+
+		/* Have tried enough. Better call it a day. */
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: Error: io_setup() failed"
+			" with EAGAIN after %d attempts.\n",
+			OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
+		break;
+
+	case -ENOSYS:
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: Error: Linux Native AIO interface"
+			" is not supported on this platform. Please"
Please" + " check your OS documentation and install" + " appropriate binary of InnoDB.\n"); + + break; + + default: + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: Linux Native AIO setup" + " returned following error[%d]\n", -ret); + break; + } + + fprintf(stderr, + "InnoDB: You can disable Linux Native AIO by" + " setting innodb_native_aio = off in my.cnf\n"); + return(FALSE); +} +#endif /* LINUX_NATIVE_AIO */ + +/********************************************************************** +Creates an aio wait array. Note that we return NULL in case of failure. +We don't care about freeing memory here because we assume that a +failure will result in server refusing to start up. */ static os_aio_array_t* os_aio_array_create( /*================*/ - /* out, own: aio array */ + /* out, own: aio array, NULL on failure */ ulint n, /* in: maximum number of pending aio operations allowed; n must be divisible by n_segments */ ulint n_segments) /* in: number of segments in the aio array */ @@ -2867,6 +3056,8 @@ os_aio_array_create( os_aio_slot_t* slot; #ifdef WIN_ASYNC_IO OVERLAPPED* over; +#elif defined(LINUX_NATIVE_AIO) + struct io_event* io_event = NULL; #endif ut_a(n > 0); ut_a(n_segments > 0); @@ -2882,10 +3073,44 @@ os_aio_array_create( array->n_slots = n; array->n_segments = n_segments; array->n_reserved = 0; + array->cur_seg = 0; array->slots = ut_malloc(n * sizeof(os_aio_slot_t)); #ifdef __WIN__ array->native_events = ut_malloc(n * sizeof(os_native_event_t)); #endif + +#if defined(LINUX_NATIVE_AIO) + /* If we are not using native aio interface then skip this + part of initialization. */ + if (!srv_use_native_aio) { + goto skip_native_aio; + } + + /* Initialize the io_context array. One io_context + per segment in the array. */ + + array->aio_ctx = ut_malloc(n_segments * + sizeof(*array->aio_ctx)); + for (i = 0; i < n_segments; ++i) { + if (!os_aio_linux_create_io_ctx(n/n_segments, + &array->aio_ctx[i])) { + /* If something bad happened during aio setup + we should call it a day and return right away. + We don't care about any leaks because a failure + to initialize the io subsystem means that the + server (or atleast the innodb storage engine) + is not going to startup. */ + return(NULL); + } + } + + /* Initialize the event array. One event per slot. */ + io_event = ut_malloc(n * sizeof(*io_event)); + memset(io_event, 0x0, sizeof(*io_event) * n); + array->aio_events = io_event; + +skip_native_aio: +#endif /* LINUX_NATIVE_AIO */ for (i = 0; i < n; i++) { slot = os_aio_array_get_nth_slot(array, i); @@ -2899,6 +3124,12 @@ os_aio_array_create( over->hEvent = slot->event->handle; *((array->native_events) + i) = over->hEvent; + +#elif defined(LINUX_NATIVE_AIO) + + memset(&slot->control, 0x0, sizeof(slot->control)); + slot->n_bytes = 0; + slot->ret = 0; #endif } @@ -2915,9 +3146,10 @@ in the three first aio arrays is the parameter n_segments given to the function. The caller must create an i/o handler thread for each segment in the four first arrays, but not for the sync aio array. */ UNIV_INTERN -void +ibool os_aio_init( /*========*/ + /* out: TRUE on success. 
 	ulint	n,		/* in: maximum number of pending aio operations
 				allowed; n must be divisible by n_segments */
 	ulint	n_segments,	/* in: combined number of segments in the four
@@ -2945,15 +3177,25 @@ os_aio_init(
 	/* fprintf(stderr, "Array n per seg %lu\n", n_per_seg); */
 
 	os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1);
+	if (os_aio_ibuf_array == NULL) {
+		goto err_exit;
+	}
 
 	srv_io_thread_function[0] = "insert buffer thread";
 
 	os_aio_log_array = os_aio_array_create(n_per_seg, 1);
+	if (os_aio_log_array == NULL) {
+		goto err_exit;
+	}
 
 	srv_io_thread_function[1] = "log thread";
 
 	os_aio_read_array = os_aio_array_create(n_read_segs * n_per_seg,
 						n_read_segs);
+	if (os_aio_read_array == NULL) {
+		goto err_exit;
+	}
+
 	for (i = 2; i < 2 + n_read_segs; i++) {
 		ut_a(i < SRV_MAX_N_IO_THREADS);
 		srv_io_thread_function[i] = "read thread";
@@ -2961,12 +3203,20 @@ os_aio_init(
 	os_aio_write_array = os_aio_array_create(n_write_segs * n_per_seg,
 						 n_write_segs);
+	if (os_aio_write_array == NULL) {
+		goto err_exit;
+	}
+
 	for (i = 2 + n_read_segs; i < n_segments; i++) {
 		ut_a(i < SRV_MAX_N_IO_THREADS);
 		srv_io_thread_function[i] = "write thread";
 	}
 
 	os_aio_sync_array = os_aio_array_create(n_slots_sync, 1);
+	if (os_aio_sync_array == NULL) {
+		goto err_exit;
+	}
+
 
 	os_aio_n_segments = n_segments;
 
@@ -2980,6 +3230,11 @@ os_aio_init(
 
 	os_last_printout = time(NULL);
 
+	return(TRUE);
+
+err_exit:
+	return(FALSE);
+
 }
 
 #ifdef WIN_ASYNC_IO
@@ -3017,6 +3272,19 @@ os_aio_wake_all_threads_at_shutdown(void)
 	os_aio_array_wake_win_aio_at_shutdown(os_aio_write_array);
 	os_aio_array_wake_win_aio_at_shutdown(os_aio_ibuf_array);
 	os_aio_array_wake_win_aio_at_shutdown(os_aio_log_array);
+
+#elif defined(LINUX_NATIVE_AIO)
+
+	/* When using native AIO interface the io helper threads
+	wait on io_getevents with a timeout value of 500ms. At
+	each wake up these threads check the server status.
+	No need to do anything to wake them up. */
+
+	if (srv_use_native_aio) {
+		return;
+	}
+	/* Fall through to simulated AIO handler wakeup if we are
+	not using native AIO. */
 #endif
 	/* This loop wakes up all simulated ai/o threads */
 
@@ -3135,18 +3403,25 @@ os_aio_array_reserve_slot(
 				offset */
 	ulint		len)	/* in: length of the block to read or write */
 {
-	os_aio_slot_t*	slot;
+	os_aio_slot_t*	slot = NULL;
 #ifdef WIN_ASYNC_IO
 	OVERLAPPED*	control;
+
+#elif defined(LINUX_NATIVE_AIO)
+
+	struct iocb*	iocb;
+	off_t		aio_offset;
+
 #endif
 	ulint		i;
+	ulint		n;
loop:
 	os_mutex_enter(array->mutex);
 
 	if (array->n_reserved == array->n_slots) {
 		os_mutex_exit(array->mutex);
 
-		if (!os_aio_use_native_aio) {
+		if (!srv_use_native_aio) {
 			/* If the handler threads are suspended, wake them
 			so that we get more slots */
 
@@ -3158,14 +3433,38 @@ os_aio_array_reserve_slot(
 		goto loop;
 	}
 
+	/* First try to allocate a slot from the next segment in
+	round robin. */
+	ut_a(array->cur_seg < array->n_segments);
+
+	n = array->n_slots / array->n_segments;
+	for (i = array->cur_seg * n; i < ((array->cur_seg + 1) * n); i++) {
+		slot = os_aio_array_get_nth_slot(array, i);
+
+		if (slot->reserved == FALSE) {
+			goto found;
+		}
+	}
+
+	ut_ad(i <= array->n_slots);
+	array->cur_seg = (array->cur_seg + 1) % array->n_segments;
+
+	/* If we are unable to find a slot in our desired segment we do
+	a linear search of the entire array. We are guaranteed to find
+	a slot in the linear search. */
 	for (i = 0;; i++) {
 		slot = os_aio_array_get_nth_slot(array, i);
 
 		if (slot->reserved == FALSE) {
-			break;
+			goto found;
 		}
 	}
 
+	/* We MUST always be able to get hold of a reserved slot. */
+	ut_error;
+
+found:
+	ut_ad(!slot->reserved);
+
 	array->n_reserved++;
 
 	if (array->n_reserved == 1) {
@@ -3194,8 +3493,42 @@ os_aio_array_reserve_slot(
 	control->Offset = (DWORD)offset;
 	control->OffsetHigh = (DWORD)offset_high;
 	os_event_reset(slot->event);
-#endif
+#elif defined(LINUX_NATIVE_AIO)
+
+	/* If we are not using native AIO skip this part. */
+	if (!srv_use_native_aio) {
+		goto skip_native_aio;
+	}
+
+	/* Check if we are dealing with 64 bit arch.
+	If not then make sure that offset fits in 32 bits. */
+	if (sizeof(aio_offset) == 8) {
+		aio_offset = offset_high;
+		aio_offset <<= 32;
+		aio_offset += offset;
+	} else {
+		ut_a(offset_high == 0);
+		aio_offset = offset;
+	}
+
+	iocb = &slot->control;
+
+	if (type == OS_FILE_READ) {
+		io_prep_pread(iocb, file, buf, len, aio_offset);
+	} else {
+		ut_a(type == OS_FILE_WRITE);
+		io_prep_pwrite(iocb, file, buf, len, aio_offset);
+	}
+
+	iocb->data = (void*)slot;
+	slot->n_bytes = 0;
+	slot->ret = 0;
+	/*fprintf(stderr, "Filled up Linux native iocb.\n");*/
+
+
+skip_native_aio:
+#endif /* LINUX_NATIVE_AIO */
 	os_mutex_exit(array->mutex);
 
 	return(slot);
@@ -3230,7 +3563,23 @@ os_aio_array_free_slot(
 	}
 
 #ifdef WIN_ASYNC_IO
+
 	os_event_reset(slot->event);
+
+#elif defined(LINUX_NATIVE_AIO)
+
+	if (srv_use_native_aio) {
+		memset(&slot->control, 0x0, sizeof(slot->control));
+		slot->n_bytes = 0;
+		slot->ret = 0;
+		/*fprintf(stderr, "Freed up Linux native slot.\n");*/
+	} else {
+		/* These fields should not be used if we are not
+		using native AIO. */
+		ut_ad(slot->n_bytes == 0);
+		ut_ad(slot->ret == 0);
+	}
+
 #endif
 	os_mutex_exit(array->mutex);
 }
@@ -3250,7 +3599,7 @@ os_aio_simulated_wake_handler_thread(
 	ulint		n;
 	ulint		i;
 
-	ut_ad(!os_aio_use_native_aio);
+	ut_ad(!srv_use_native_aio);
 
 	segment = os_aio_get_array_and_local_segment(&array, global_segment);
 
@@ -3286,7 +3635,7 @@ os_aio_simulated_wake_handler_threads(void)
 {
 	ulint	i;
 
-	if (os_aio_use_native_aio) {
+	if (srv_use_native_aio) {
 		/* We do not use simulated aio: do nothing */
 
 		return;
@@ -3324,6 +3673,54 @@ os_aio_simulated_put_read_threads_to_sleep(void)
 	}
 }
 
+#if defined(LINUX_NATIVE_AIO)
+/***********************************************************************
+Dispatch an AIO request to the kernel. */
+static
+ibool
+os_aio_linux_dispatch(
+/*==================*/
+				/* out: TRUE on success. */
+	os_aio_array_t*	array,	/* in: io request array. */
+	os_aio_slot_t*	slot)	/* in: an already reserved slot. */
+{
+	int		ret;
+	ulint		io_ctx_index;
+	struct iocb*	iocb;
+
+	ut_ad(slot != NULL);
+	ut_ad(array);
+
+	ut_a(slot->reserved);
+
+	/* Find out what we are going to work with.
+	The iocb struct is directly in the slot.
+	The io_context is one per segment. */
+
+	iocb = &slot->control;
+	io_ctx_index = (slot->pos * array->n_segments) / array->n_slots;
+
+	ret = io_submit(array->aio_ctx[io_ctx_index], 1, &iocb);
+
+#if defined(UNIV_AIO_DEBUG)
+	fprintf(stderr,
+		"io_submit[%c] ret[%d]: slot[%p] ctx[%p] seg[%lu]\n",
+		(slot->type == OS_FILE_WRITE) ? 'w' : 'r', ret, slot,
+		array->aio_ctx[io_ctx_index], (ulong)io_ctx_index);
+#endif
+
+	/* io_submit returns the number of successfully
+	queued requests or -errno. */
+	if (UNIV_UNLIKELY(ret != 1)) {
+		errno = -ret;
+		return(FALSE);
+	}
+
+	return(TRUE);
+}
+#endif /* LINUX_NATIVE_AIO */
+
+
 /***********************************************************************
 Requests an asynchronous i/o operation. */
 UNIV_INTERN
@@ -3372,7 +3769,6 @@ os_aio(
 	void*		dummy_mess2;
 	ulint		dummy_type;
 #endif
-	ulint		err		= 0;
 	ibool		retry;
 	ulint		wake_later;
 
@@ -3388,7 +3784,7 @@ os_aio(
 
 	if (mode == OS_AIO_SYNC
 #ifdef WIN_ASYNC_IO
-	    && !os_aio_use_native_aio
+	    && !srv_use_native_aio
 #endif
 	    ) {
 		/* This is actually an ordinary synchronous read or write:
@@ -3428,6 +3824,11 @@ os_aio(
 		array = os_aio_log_array;
 	} else if (mode == OS_AIO_SYNC) {
 		array = os_aio_sync_array;
+
+#if defined(LINUX_NATIVE_AIO)
+		/* In Linux native AIO we don't use sync IO array. */
+		ut_a(!srv_use_native_aio);
+#endif
 	} else {
 		array = NULL; /* Eliminate compiler warning */
 		ut_error;
@@ -3436,13 +3837,17 @@ os_aio(
 	slot = os_aio_array_reserve_slot(type, array, message1, message2, file,
 					 name, buf, offset, offset_high, n);
 
 	if (type == OS_FILE_READ) {
-		if (os_aio_use_native_aio) {
-#ifdef WIN_ASYNC_IO
+		if (srv_use_native_aio) {
 			os_n_file_reads++;
-			os_bytes_read_since_printout += len;
-
+			os_bytes_read_since_printout += n;
+#ifdef WIN_ASYNC_IO
 			ret = ReadFile(file, buf, (DWORD)n, &len,
 				       &(slot->control));
+
+#elif defined(LINUX_NATIVE_AIO)
+			if (!os_aio_linux_dispatch(array, slot)) {
+				goto err_exit;
+			}
 #endif
 		} else {
 			if (!wake_later) {
@@ -3452,11 +3857,16 @@ os_aio(
 			}
 		}
 	} else if (type == OS_FILE_WRITE) {
-		if (os_aio_use_native_aio) {
-#ifdef WIN_ASYNC_IO
+		if (srv_use_native_aio) {
 			os_n_file_writes++;
+#ifdef WIN_ASYNC_IO
 			ret = WriteFile(file, buf, (DWORD)n, &len,
 					&(slot->control));
+
+#elif defined(LINUX_NATIVE_AIO)
+			if (!os_aio_linux_dispatch(array, slot)) {
+				goto err_exit;
+			}
 #endif
 		} else {
 			if (!wake_later) {
@@ -3470,7 +3880,7 @@ os_aio(
 	}
 
 #ifdef WIN_ASYNC_IO
-	if (os_aio_use_native_aio) {
+	if (srv_use_native_aio) {
 		if ((ret && len == n)
 		    || (!ret && GetLastError() == ERROR_IO_PENDING)) {
 			/* aio was queued successfully! */
@@ -3493,15 +3903,13 @@ os_aio(
 
 			return(TRUE);
 		}
 
-		err = 1; /* Fall through the next if */
+		goto err_exit;
 	}
 #endif
 
-	if (err == 0) {
-		/* aio was queued successfully! */
-
-		return(TRUE);
-	}
+	/* aio was queued successfully! */
+	return(TRUE);
 
+err_exit:
 	os_aio_array_free_slot(array, slot);
 
 	retry = os_file_handle_error(name,
@@ -3604,7 +4012,9 @@ os_aio_windows_handle(
 #ifdef UNIV_DO_FLUSH
 		if (slot->type == OS_FILE_WRITE
 		    && !os_do_not_call_flush_at_each_write) {
-			ut_a(TRUE == os_file_flush(slot->file));
+			if (!os_file_flush(slot->file)) {
+				ut_error;
+			}
 		}
 #endif /* UNIV_DO_FLUSH */
 	} else {
@@ -3621,6 +4031,257 @@ os_aio_windows_handle(
 }
 #endif
 
+#if defined(LINUX_NATIVE_AIO)
+/**********************************************************************
+This function is only used in Linux native asynchronous i/o. This is
+called from within the io-thread. If there are no completed IO requests
+in the slot array, the thread calls this function to collect more
+requests from the kernel.
+The io-thread waits on io_getevents(), which is a blocking call, with
+a timeout value. Unless the system is very heavily loaded, keeping the
+io-thread very busy, the io-thread will spend most of its time waiting
+in this function.
+The io-thread also exits in this function. It checks server status at
+each wakeup and that is why we use timed wait in io_getevents(). */
+static
+void
+os_aio_linux_collect(
+/*=================*/
+	os_aio_array_t*	array,		/* in/out: slot array. */
+	ulint		segment,	/* in: local segment no. */
+	ulint		seg_size)	/* in: segment size. */
+{
+	int			i;
+	int			ret;
+	ulint			start_pos;
+	ulint			end_pos;
+	struct timespec		timeout;
+	struct io_event*	events;
+	struct io_context*	io_ctx;
+
+	/* sanity checks. */
+	ut_ad(array != NULL);
+	ut_ad(seg_size > 0);
+	ut_ad(segment < array->n_segments);
+
+	/* Which part of event array we are going to work on. */
+	events = &array->aio_events[segment * seg_size];
+
+	/* Which io_context we are going to use. */
+	io_ctx = array->aio_ctx[segment];
+
+	/* Starting point of the segment we will be working on. */
+	start_pos = segment * seg_size;
+
+	/* End point. */
+	end_pos = start_pos + seg_size;
+
+retry:
+
+	/* Go down if we are in shutdown mode.
+	In case of srv_fast_shutdown == 2, there may be pending
+	IO requests but that should be OK as we essentially treat
+	that as a crash of InnoDB. */
+	if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
+		os_thread_exit(NULL);
+	}
+
+	/* Initialize the events. The timeout value is arbitrary.
+	We probably need to experiment with it a little. */
+	memset(events, 0, sizeof(*events) * seg_size);
+	timeout.tv_sec = 0;
+	timeout.tv_nsec = OS_AIO_REAP_TIMEOUT;
+
+	ret = io_getevents(io_ctx, 1, seg_size, events, &timeout);
+
+	/* This error handling is for any error in collecting the
+	IO requests. The errors, if any, for any particular IO
+	request are simply passed on to the calling routine. */
+
+	/* Not enough resources! Try again. */
+	if (ret == -EAGAIN) {
+		goto retry;
+	}
+
+	/* Interrupted! I have tested the behaviour in case of an
+	interrupt. If we have some completed IOs available then
+	the return code will be the number of IOs. We get EINTR only
+	if there are no completed IOs and we have been interrupted. */
+	if (ret == -EINTR) {
+		goto retry;
+	}
+
+	/* No pending request! Go back and check again. */
+	if (ret == 0) {
+		goto retry;
+	}
+
+	/* All other errors should cause a trap for now. */
+	if (UNIV_UNLIKELY(ret < 0)) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: unexpected ret_code[%d] from"
+			" io_getevents()!\n", ret);
+		ut_error;
+	}
+
+	ut_a(ret > 0);
+
+	for (i = 0; i < ret; i++) {
+		os_aio_slot_t*	slot;
+		struct iocb*	control;
+
+		control = (struct iocb *)events[i].obj;
+		ut_a(control != NULL);
+
+		slot = (os_aio_slot_t *) control->data;
+
+		/* Some sanity checks. */
+		ut_a(slot != NULL);
+		ut_a(slot->reserved);
+
+#if defined(UNIV_AIO_DEBUG)
+		fprintf(stderr,
+			"io_getevents[%c]: slot[%p] ctx[%p]"
+			" seg[%lu]\n",
+			(slot->type == OS_FILE_WRITE) ? 'w' : 'r',
+			slot, io_ctx, segment);
+#endif
+
+		/* We should not be scribbling on a previous segment. */
+		ut_a(slot->pos >= start_pos);
+
+		/* We should not have overstepped into the next segment. */
+		ut_a(slot->pos < end_pos);
+
+		/* Mark this request as completed. The error handling
+		will be done in the calling function. */
+		os_mutex_enter(array->mutex);
+		slot->n_bytes = events[i].res;
+		slot->ret = events[i].res2;
+		slot->io_already_done = TRUE;
+		os_mutex_exit(array->mutex);
+	}
+
+	return;
+}
+
+/**************************************************************************
+This function is only used in Linux native asynchronous i/o.
+Waits for an aio operation to complete. This function is used to wait for
+the completed requests. The aio array of pending requests is divided
+into segments. The thread specifies which segment or slot it wants to wait
+for. NOTE: this function will also take care of freeing the aio slot,
+therefore no other thread is allowed to do the freeing! */
+UNIV_INTERN
+ibool
+os_aio_linux_handle(
+/*================*/
+				/* out: TRUE if the IO was successful */
+	ulint	global_seg,	/* in: segment number in the aio array
+				to wait for; segment 0 is the ibuf
+				i/o thread, segment 1 is log i/o thread,
+				then follow the non-ibuf read threads,
+				and the last are the non-ibuf write
+				threads. */
+	fil_node_t**message1,	/* out: the messages passed with the */
+	void**	message2,	/* aio request; note that in case the
+				aio operation failed, these output
+				parameters are valid and can be used to
+				restart the operation. */
+	ulint*	type)		/* out: OS_FILE_WRITE or ..._READ */
+{
+	ulint		segment;
+	os_aio_array_t*	array;
+	os_aio_slot_t*	slot;
+	ulint		n;
+	ulint		i;
+	ibool		ret = FALSE;
+
+	/* Should never be doing Sync IO here. */
+	ut_a(global_seg != ULINT_UNDEFINED);
+
+	/* Find the array and the local segment. */
+	segment = os_aio_get_array_and_local_segment(&array, global_seg);
+	n = array->n_slots / array->n_segments;
+
+	/* Loop until we have found a completed request. */
+	for (;;) {
+		os_mutex_enter(array->mutex);
+		for (i = 0; i < n; ++i) {
+			slot = os_aio_array_get_nth_slot(
+				array, i + segment * n);
+			if (slot->reserved && slot->io_already_done) {
+				/* Something for us to work on. */
+				goto found;
+			}
+		}
+
+		os_mutex_exit(array->mutex);
+
+		/* We don't have any completed request.
+		Wait for some request. Note that we return
+		from wait iff we have found a request. */
+
+		srv_set_io_thread_op_info(global_seg,
+			"waiting for completed aio requests");
+		os_aio_linux_collect(array, segment, n);
+	}
+
+found:
+	/* Note that there may be more than one completed
+	IO request. We process them one at a time. There may be a
+	case for improving the performance slightly by dealing with
+	all requests in one sweep. */
+	srv_set_io_thread_op_info(global_seg,
+				"processing completed aio requests");
+
+	/* Ensure that we are scribbling only within our segment. */
+	ut_a(i < n);
+
+	ut_ad(slot != NULL);
+	ut_ad(slot->reserved);
+	ut_ad(slot->io_already_done);
+
+	*message1 = slot->message1;
+	*message2 = slot->message2;
+
+	*type = slot->type;
+
+	if ((slot->ret == 0) && (slot->n_bytes == (long)slot->len)) {
+		ret = TRUE;
+
+#ifdef UNIV_DO_FLUSH
+		if (slot->type == OS_FILE_WRITE
+		    && !os_do_not_call_flush_at_each_write
+		    && !os_file_flush(slot->file)) {
+			ut_error;
+		}
+#endif /* UNIV_DO_FLUSH */
+	} else {
+		errno = -slot->ret;
+
+		/* os_file_handle_error does tell us if we should retry
+		this IO. As it stands now, we don't do this retry when
+		reaping requests from a different context than
+		the dispatcher. This non-retry logic is the same for
+		windows and linux native AIO.
+		We should probably look into this to transparently
+		re-submit the IO. */
+		os_file_handle_error(slot->name, "Linux aio");
+
+		ret = FALSE;
+	}
+
+	os_mutex_exit(array->mutex);
+
+	os_aio_array_free_slot(array, slot);
+
+	return(ret);
+}
+
+#endif /* LINUX_NATIVE_AIO */
+
 /**************************************************************************
 Does simulated aio. This function should be called by an i/o-handler
 thread. */
@@ -3995,6 +4656,40 @@ os_aio_validate(void)
 	return(TRUE);
 }
 
+/**************************************************************************
+Prints pending IO requests per segment of an aio array.
+We probably don't need per segment statistics but they can help us
+during the development phase to see if the IO requests are being
+distributed as expected. */
+static
+void
+os_aio_print_segment_info(
+/*======================*/
+	FILE*		file,	/* in: file where to print */
+	ulint*		n_seg,	/* in: pending IO array */
+	os_aio_array_t*	array)	/* in: array to process */
+{
+	ulint	i;
+
+	ut_ad(array);
+	ut_ad(n_seg);
+	ut_ad(array->n_segments > 0);
+
+	if (array->n_segments == 1) {
+		return;
+	}
+
+	fprintf(file, " [");
+	for (i = 0; i < array->n_segments; i++) {
+		if (i != 0) {
+			fprintf(file, ", ");
+		}
+
+		fprintf(file, "%lu", n_seg[i]);
+	}
+	fprintf(file, "] ");
+}
+
 /**************************************************************************
 Prints info of the aio arrays. */
 UNIV_INTERN
@@ -4006,6 +4701,7 @@ os_aio_print(
 	os_aio_array_t*	array;
 	os_aio_slot_t*	slot;
 	ulint		n_reserved;
+	ulint		n_res_seg[SRV_MAX_N_IO_THREADS];
 	time_t		current_time;
 	double		time_elapsed;
 	double		avg_bytes_read;
@@ -4038,11 +4734,15 @@ os_aio_print(
 
 	n_reserved = 0;
 
+	memset(n_res_seg, 0x0, sizeof(n_res_seg));
+
 	for (i = 0; i < array->n_slots; i++) {
+		ulint	seg_no;
+
 		slot = os_aio_array_get_nth_slot(array, i);
+		seg_no = (i * array->n_segments) / array->n_slots;
 
 		if (slot->reserved) {
 			n_reserved++;
+			n_res_seg[seg_no]++;
 #if 0
 			fprintf(stderr, "Reserved slot, messages %p %p\n",
 				(void*) slot->message1,
@@ -4056,6 +4756,8 @@ os_aio_print(
 
 	fprintf(file, " %lu", (ulong) n_reserved);
 
+	os_aio_print_segment_info(file, n_res_seg, array);
+
 	os_mutex_exit(array->mutex);
 
 	if (array == os_aio_read_array) {
diff --git a/plug.in b/plug.in
index 34ad5d77c0d..ec71d028d50 100644
--- a/plug.in
+++ b/plug.in
@@ -12,6 +12,14 @@ MYSQL_PLUGIN_ACTIONS(innobase,  [
     AC_C_BIGENDIAN
     case "$target_os" in
       lin*)
+        AC_CHECK_HEADER(libaio.h,
+          AC_CHECK_LIB(aio, io_setup,
+            LIBS="$LIBS -laio"
+            AC_DEFINE(LINUX_NATIVE_AIO, [1],
+              [Linux native async I/O support]),
+            AC_MSG_WARN([No Linux native async I/O])),
+          AC_MSG_WARN([No Linux native async I/O]))
+
         CFLAGS="$CFLAGS -DUNIV_LINUX";;
       hpux10*)
         CFLAGS="$CFLAGS -DUNIV_MUST_NOT_INLINE -DUNIV_HPUX -DUNIV_HPUX10";;
diff --git a/srv/srv0srv.c b/srv/srv0srv.c
index d75269e96d5..2dea4dad943 100644
--- a/srv/srv0srv.c
+++ b/srv/srv0srv.c
@@ -102,6 +102,12 @@ UNIV_INTERN ulint	srv_check_file_format_at_startup = DICT_TF_FORMAT_MAX;
 on duplicate key checking and foreign key checking */
 UNIV_INTERN ibool	srv_locks_unsafe_for_binlog = FALSE;
 
+/* If this flag is TRUE, then we will use the native aio of the
+OS (provided we compiled Innobase with it in), otherwise we will
+use the simulated aio we build with threads.
+Currently we support native aio on windows and linux */
+UNIV_INTERN my_bool	srv_use_native_aio = TRUE;
+
 UNIV_INTERN ulint	srv_n_data_files = 0;
 UNIV_INTERN char**	srv_data_file_names = NULL;
 /* size in database pages */
diff --git a/srv/srv0start.c b/srv/srv0start.c
index 1e8c10c13bb..696c4a51c8f 100644
--- a/srv/srv0start.c
+++ b/srv/srv0start.c
@@ -969,6 +969,7 @@ innobase_start_or_create_for_mysql(void)
 	ibool		log_file_created;
 	ibool		log_created	= FALSE;
 	ibool		log_opened	= FALSE;
+	ibool		success;
 	ib_uint64_t	min_flushed_lsn;
 	ib_uint64_t	max_flushed_lsn;
 #ifdef UNIV_LOG_ARCHIVE
@@ -1071,7 +1072,6 @@ innobase_start_or_create_for_mysql(void)
 
 	srv_is_being_started = TRUE;
 	srv_startup_is_before_trx_rollback_phase = TRUE;
-	os_aio_use_native_aio = FALSE;
 
 #ifdef __WIN__
 	if (os_get_os_version() == OS_WIN95
@@ -1083,12 +1083,30 @@ innobase_start_or_create_for_mysql(void)
 		but when run in conjunction with InnoDB Hot Backup, it seemed
 		to corrupt the data files. */
-		os_aio_use_native_aio = FALSE;
+		srv_use_native_aio = FALSE;
 	} else {
 		/* On Win 2000 and XP use async i/o */
-		os_aio_use_native_aio = TRUE;
+		srv_use_native_aio = TRUE;
 	}
+
+#elif defined(LINUX_NATIVE_AIO)
+
+	if (srv_use_native_aio) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: Using Linux native AIO\n");
+	}
+#else
+	/* Currently native AIO is supported only on windows and linux,
+	and even then only when the support is compiled in. In all other
+	cases, we ignore the setting of innodb_use_native_aio. */
+
+	/* TODO: comment this out after internal testing. */
+	fprintf(stderr, "Ignoring innodb_use_native_aio\n");
+	srv_use_native_aio = FALSE;
+
 #endif
+
 	if (srv_file_flush_method_str == NULL) {
 		/* These are the default options */
 
@@ -1113,11 +1131,11 @@ innobase_start_or_create_for_mysql(void)
 #else
 	} else if (0 == ut_strcmp(srv_file_flush_method_str, "normal")) {
 		srv_win_file_flush_method = SRV_WIN_IO_NORMAL;
-		os_aio_use_native_aio = FALSE;
+		srv_use_native_aio = FALSE;
 
 	} else if (0 == ut_strcmp(srv_file_flush_method_str, "unbuffered")) {
 		srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED;
-		os_aio_use_native_aio = FALSE;
+		srv_use_native_aio = FALSE;
 
 	} else if (0 == ut_strcmp(srv_file_flush_method_str,
 				  "async_unbuffered")) {
@@ -1210,19 +1228,38 @@ innobase_start_or_create_for_mysql(void)
 		srv_n_file_io_threads = SRV_MAX_N_IO_THREADS;
 	}
 
-	if (!os_aio_use_native_aio) {
+	if (!srv_use_native_aio) {
 		/* In simulated aio we currently have use only for 4 threads */
 		srv_n_file_io_threads = 4;
 
-		os_aio_init(8 * SRV_N_PENDING_IOS_PER_THREAD
-			    * srv_n_file_io_threads,
-			    srv_n_file_io_threads,
-			    SRV_MAX_N_PENDING_SYNC_IOS);
+		success = os_aio_init(8 * SRV_N_PENDING_IOS_PER_THREAD *
+				      srv_n_file_io_threads,
+				      srv_n_file_io_threads,
+				      SRV_MAX_N_PENDING_SYNC_IOS);
+		if (!success) {
+			return(DB_ERROR);
+		}
 	} else {
-		os_aio_init(SRV_N_PENDING_IOS_PER_THREAD
-			    * srv_n_file_io_threads,
-			    srv_n_file_io_threads,
-			    SRV_MAX_N_PENDING_SYNC_IOS);
+		/* Windows has a per-thread limit on pending IOs.
+		Linux does not have any such restriction.
+		The choice of segment size is a trade-off: a larger
+		size means longer linear searches through the array,
+		while a smaller value can lead to the array filling
+		up, causing unnecessary delays. The following value
+		for Linux is fairly arbitrary and needs to be
+		tested and tuned. */
+		success = os_aio_init(
+#if defined(LINUX_NATIVE_AIO)
+			8 *
+#endif /* LINUX_NATIVE_AIO */
+			SRV_N_PENDING_IOS_PER_THREAD *
+			srv_n_file_io_threads,
+			srv_n_file_io_threads,
+			SRV_MAX_N_PENDING_SYNC_IOS);
+		if (!success) {
+			return(DB_ERROR);
+		}
 	}
 
 	fil_init(srv_max_n_open_files);
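
For reference, the complete submit/reap cycle that this patch spreads across os_aio_linux_create_io_ctx(), os_aio_array_reserve_slot(), os_aio_linux_dispatch() and os_aio_linux_collect() can be exercised stand-alone with libaio. The sketch below is illustrative only and is not part of the patch; the file name "testfile", the single 4096-byte request and the simplified error handling are assumptions made for the example. Build with: gcc -o aio_demo aio_demo.c -laio. Note that the kernel performs truly asynchronous reads and writes only on O_DIRECT file handles; on a buffered descriptor, as used here for simplicity, io_submit() may complete the request synchronously.

/* aio_demo.c: minimal sketch of the Linux native AIO cycle. */
#include <libaio.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>

int
main(void)
{
	io_context_t	ctx = 0;	/* must be zeroed before io_setup() */
	struct iocb	cb;
	struct iocb*	cbs[1];
	struct io_event	event;
	struct timespec	timeout;
	void*		buf;
	int		fd;
	int		ret;

	/* One context that can hold a single pending request;
	cf. os_aio_linux_create_io_ctx(), which creates one context
	per segment, sized n_slots / n_segments. */
	ret = io_setup(1, &ctx);
	if (ret != 0) {
		fprintf(stderr, "io_setup failed: %d\n", -ret);
		return(1);
	}

	fd = open("testfile", O_RDONLY);	/* hypothetical input file */
	if (fd < 0) {
		perror("open");
		return(1);
	}

	buf = malloc(4096);
	if (buf == NULL) {
		return(1);
	}

	/* Prepare a 4096-byte read at offset 0. The patch does this in
	os_aio_array_reserve_slot() and stores the slot pointer in
	cb.data so the completion can be mapped back to its slot;
	here we just store the buffer pointer. */
	io_prep_pread(&cb, fd, buf, 4096, 0);
	cb.data = buf;

	cbs[0] = &cb;
	ret = io_submit(ctx, 1, cbs);	/* cf. os_aio_linux_dispatch() */
	if (ret != 1) {
		fprintf(stderr, "io_submit failed: %d\n", -ret);
		return(1);
	}

	/* Reap with a 500ms timeout, the value the patch uses for
	OS_AIO_REAP_TIMEOUT; cf. os_aio_linux_collect(). event.res
	holds the byte count (or a negative errno) and event.res2
	the error status, which the patch copies into slot->n_bytes
	and slot->ret. */
	timeout.tv_sec = 0;
	timeout.tv_nsec = 500000000;
	ret = io_getevents(ctx, 1, 1, &event, &timeout);
	if (ret == 1) {
		printf("read %ld bytes, res2 = %ld\n",
		       (long) event.res, (long) event.res2);
	}

	close(fd);
	io_destroy(ctx);
	return(0);
}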