diff --git a/cmake/os/WindowsCache.cmake b/cmake/os/WindowsCache.cmake index c1048661aaa..75d21f6ca41 100644 --- a/cmake/os/WindowsCache.cmake +++ b/cmake/os/WindowsCache.cmake @@ -61,7 +61,6 @@ SET(HAVE_GETIFADDRS CACHE INTERNAL "") SET(HAVE_GETCWD 1 CACHE INTERNAL "") SET(HAVE_GETHOSTBYADDR_R CACHE INTERNAL "") SET(HAVE_GETHRTIME CACHE INTERNAL "") -SET(HAVE_GETPAGESIZE CACHE INTERNAL "") SET(HAVE_GETPASS CACHE INTERNAL "") SET(HAVE_GETMNTENT CACHE INTERNAL "") SET(HAVE_GETMNTENT_IN_SYS_MNTAB CACHE INTERNAL "") diff --git a/config.h.cmake b/config.h.cmake index 49783ce6b50..b81c4148ccb 100644 --- a/config.h.cmake +++ b/config.h.cmake @@ -151,7 +151,6 @@ #cmakedefine HAVE_GETCWD 1 #cmakedefine HAVE_GETHOSTBYADDR_R 1 #cmakedefine HAVE_GETHRTIME 1 -#cmakedefine HAVE_GETPAGESIZE 1 #cmakedefine HAVE_GETPAGESIZES 1 #cmakedefine HAVE_GETPASS 1 #cmakedefine HAVE_GETPASSPHRASE 1 diff --git a/configure.cmake b/configure.cmake index 273c155158e..474c82cb59c 100644 --- a/configure.cmake +++ b/configure.cmake @@ -463,7 +463,6 @@ CHECK_SYMBOL_EXISTS(madvise "sys/mman.h" HAVE_DECL_MADVISE) CHECK_SYMBOL_EXISTS(getpagesizes "sys/mman.h" HAVE_GETPAGESIZES) CHECK_SYMBOL_EXISTS(tzname "time.h" HAVE_TZNAME) CHECK_SYMBOL_EXISTS(lrand48 "stdlib.h" HAVE_LRAND48) -CHECK_SYMBOL_EXISTS(getpagesize "unistd.h" HAVE_GETPAGESIZE) CHECK_SYMBOL_EXISTS(TIOCGWINSZ "sys/ioctl.h" GWINSZ_IN_SYS_IOCTL) CHECK_SYMBOL_EXISTS(FIONREAD "sys/ioctl.h" FIONREAD_IN_SYS_IOCTL) CHECK_SYMBOL_EXISTS(TIOCSTAT "sys/ioctl.h" TIOCSTAT_IN_SYS_IOCTL) diff --git a/extra/mariabackup/xtrabackup.cc b/extra/mariabackup/xtrabackup.cc index ca8891ca546..e5d05b230fd 100644 --- a/extra/mariabackup/xtrabackup.cc +++ b/extra/mariabackup/xtrabackup.cc @@ -205,6 +205,8 @@ lsn_t checkpoint_lsn_start; lsn_t checkpoint_no_start; /** whether log_copying_thread() is active; protected by recv_sys.mutex */ static bool log_copying_running; +/** for --backup, target LSN to copy the log to; protected by recv_sys.mutex */ +lsn_t 
metadata_to_lsn; uint xtrabackup_parallel; @@ -236,7 +238,6 @@ my_bool opt_encrypted_backup; #define XTRABACKUP_METADATA_FILENAME "xtrabackup_checkpoints" char metadata_type[30] = ""; /*[full-backuped|log-applied|incremental]*/ static lsn_t metadata_from_lsn; -lsn_t metadata_to_lsn; static lsn_t metadata_last_lsn; static ds_file_t* dst_log_file; @@ -282,9 +283,6 @@ my_bool xtrabackup_incremental_force_scan = FALSE; */ ulong xtrabackup_innodb_force_recovery = 0; -/* The flushed lsn which is read from data files */ -lsn_t flushed_lsn= 0; - ulong xb_open_files_limit= 0; char *xb_plugin_dir; char *xb_plugin_load; @@ -1329,6 +1327,9 @@ enum options_xtrabackup OPT_INNODB_BUFFER_POOL_FILENAME, OPT_INNODB_LOCK_WAIT_TIMEOUT, OPT_INNODB_LOG_BUFFER_SIZE, +#ifdef HAVE_INNODB_MMAP + OPT_INNODB_LOG_FILE_MMAP, +#endif #if defined __linux__ || defined _WIN32 OPT_INNODB_LOG_FILE_BUFFERING, #endif @@ -1890,6 +1891,13 @@ struct my_option xb_server_options[] = (G_PTR*) &log_sys.buf_size, (G_PTR*) &log_sys.buf_size, 0, GET_UINT, REQUIRED_ARG, 2U << 20, 2U << 20, log_sys.buf_size_max, 0, 4096, 0}, +#ifdef HAVE_INNODB_MMAP + {"innodb_log_file_mmap", OPT_INNODB_LOG_FILE_MMAP, + "Whether ib_logfile0 should be memory-mapped", + (G_PTR*) &log_sys.log_mmap, + (G_PTR*) &log_sys.log_mmap, 0, GET_BOOL, NO_ARG, + log_sys.log_mmap_default, 0, 0, 0, 0, 0}, +#endif #if defined __linux__ || defined _WIN32 {"innodb_log_file_buffering", OPT_INNODB_LOG_FILE_BUFFERING, "Whether the file system cache for ib_logfile0 is enabled during --backup", @@ -3368,8 +3376,108 @@ skip: return(FALSE); } +#ifdef HAVE_INNODB_MMAP +static int +xtrabackup_copy_mmap_snippet(ds_file_t *ds, const byte *start, const byte *end) +{ + if (UNIV_UNLIKELY(start > end)) + { + if (int r= ds_write(ds, start, log_sys.buf + log_sys.file_size - start)) + return r; + start= log_sys.buf + log_sys.START_OFFSET; + } + return ds_write(ds, start, end - start); +} + +/** Copy memory-mapped log until the end of the log is reached +or the 
log_copying_stop signal is received +@return whether the operation failed */ +static bool xtrabackup_copy_mmap_logfile() +{ + mysql_mutex_assert_owner(&recv_sys.mutex); + recv_sys.offset= size_t(log_sys.calc_lsn_offset(recv_sys.lsn)); + recv_sys.len= size_t(log_sys.file_size); + const size_t seq_offset{log_sys.is_encrypted() ? 8U + 5U : 5U}; + const char one{'\1'}; + + for (unsigned retry_count{0};;) + { + recv_sys_t::parse_mtr_result r; + const byte *start= &log_sys.buf[recv_sys.offset]; + + if (recv_sys.parse_mmap(false) == recv_sys_t::OK) + { + const byte *end; + + do + { + /* Set the sequence bit (the backed-up log will not wrap around) */ + size_t seqo= recv_sys.offset - seq_offset; + if (seqo < log_sys.START_OFFSET) + seqo+= log_sys.file_size - log_sys.START_OFFSET; + const byte *seq= &log_sys.buf[seqo]; + ut_ad(*seq == log_sys.get_sequence_bit(recv_sys.lsn - seq_offset)); + if (!*seq) + { + if (xtrabackup_copy_mmap_snippet(dst_log_file, start, seq) || + ds_write(dst_log_file, &one, 1)) + goto write_error; + start = seq + 1; + } + } + while ((r= recv_sys.parse_mmap(false)) == recv_sys_t::OK); + + end= &log_sys.buf[recv_sys.offset]; + + if (xtrabackup_copy_mmap_snippet(dst_log_file, start, end)) + { + write_error: + msg("Error: write to ib_logfile0 failed"); + return true; + } + + start= end; + + pthread_cond_broadcast(&scanned_lsn_cond); + + if (r == recv_sys_t::GOT_EOF) + break; + + retry_count= 0; + } + else + { + if (metadata_to_lsn) + { + if (metadata_to_lsn <= recv_sys.lsn) + return false; + } + else if (xtrabackup_throttle && io_ticket-- < 0) + mysql_cond_wait(&wait_throttle, &recv_sys.mutex); + + if (!retry_count++) + msg("Retrying read of log at LSN=" LSN_PF, recv_sys.lsn); + else if (retry_count == 100) + break; + else + { + timespec abstime; + set_timespec_nsec(abstime, 1000000ULL /* 1 ms */); + if (!mysql_cond_timedwait(&log_copying_stop, &recv_sys.mutex, + &abstime)) + return true; + } + } + } + + if (verbose) + msg(">> log scanned up to (" LSN_PF 
")", recv_sys.lsn); + return false; +} +#endif + /** Copy redo log until the current end of the log is reached -@return whether the operation failed */ +@return whether the operation failed */ static bool xtrabackup_copy_logfile() { mysql_mutex_assert_owner(&recv_sys.mutex); @@ -3377,16 +3485,17 @@ static bool xtrabackup_copy_logfile() ut_a(dst_log_file); ut_ad(recv_sys.is_initialised()); + +#ifdef HAVE_INNODB_MMAP + if (log_sys.is_mmap()) + return xtrabackup_copy_mmap_logfile(); +#endif const size_t sequence_offset{log_sys.is_encrypted() ? 8U + 5U : 5U}; const size_t block_size_1{log_sys.write_size - 1}; - ut_ad(!log_sys.is_pmem()); - - { - recv_sys.offset= size_t(recv_sys.lsn - log_sys.get_first_lsn()) & - block_size_1; - recv_sys.len= 0; - } + recv_sys.offset= size_t(recv_sys.lsn - log_sys.get_first_lsn()) & + block_size_1; + recv_sys.len= 0; for (unsigned retry_count{0};;) { @@ -5376,9 +5485,8 @@ fail: goto fail; } - if (!log_sys.create()) { - goto fail; - } + log_sys.create(); + /* get current checkpoint_lsn */ { log_sys.latch.wr_lock(SRW_LOCK_CALL); @@ -6730,9 +6838,7 @@ error: } recv_sys.create(); - if (!log_sys.create()) { - goto error; - } + log_sys.create(); recv_sys.recovery_on = true; xb_fil_io_init(); diff --git a/include/my_sys.h b/include/my_sys.h index e4ab497a0e8..0797f96607e 100644 --- a/include/my_sys.h +++ b/include/my_sys.h @@ -1017,11 +1017,7 @@ extern int my_win_pclose(FILE*); #endif /* my_getpagesize */ -#ifdef HAVE_GETPAGESIZE -#define my_getpagesize() getpagesize() -#else int my_getpagesize(void); -#endif int my_msync(int, void *, size_t, int); diff --git a/mysql-test/suite/innodb/r/log_file_size_online.result b/mysql-test/suite/innodb/r/log_file_size_online.result index e4c4e899995..e34d4f3cdc2 100644 --- a/mysql-test/suite/innodb/r/log_file_size_online.result +++ b/mysql-test/suite/innodb/r/log_file_size_online.result @@ -19,6 +19,12 @@ SHOW VARIABLES LIKE 'innodb_log_file_size'; Variable_name Value innodb_log_file_size 4194304 FOUND 1 
/InnoDB: Resized log to 4\.000MiB/ in mysqld.1.err +SET @save=@@GLOBAL.innodb_log_file_buffering; +SET GLOBAL innodb_log_file_buffering=OFF; +SET GLOBAL innodb_log_file_buffering=ON; +SET GLOBAL innodb_log_file_buffering=@save; +SET GLOBAL innodb_log_file_mmap=OFF; +Got one of the listed errors SET GLOBAL innodb_log_file_size=5242880; connect con1,localhost,root; UPDATE t SET b='' WHERE a<10; diff --git a/mysql-test/suite/innodb/t/log_file_size_online.test b/mysql-test/suite/innodb/t/log_file_size_online.test index 3b56144ca43..14224ab9c47 100644 --- a/mysql-test/suite/innodb/t/log_file_size_online.test +++ b/mysql-test/suite/innodb/t/log_file_size_online.test @@ -25,6 +25,17 @@ SHOW VARIABLES LIKE 'innodb_log_file_size'; let SEARCH_PATTERN = InnoDB: Resized log to 4\\.000MiB; --source include/search_pattern_in_file.inc +--error 0,ER_UNKNOWN_SYSTEM_VARIABLE +SET @save=@@GLOBAL.innodb_log_file_buffering; +--error 0,ER_UNKNOWN_SYSTEM_VARIABLE +SET GLOBAL innodb_log_file_buffering=OFF; +--error 0,ER_UNKNOWN_SYSTEM_VARIABLE +SET GLOBAL innodb_log_file_buffering=ON; +--error 0,ER_UNKNOWN_SYSTEM_VARIABLE +SET GLOBAL innodb_log_file_buffering=@save; +--error ER_INCORRECT_GLOBAL_LOCAL_VAR,ER_UNKNOWN_SYSTEM_VARIABLE +SET GLOBAL innodb_log_file_mmap=OFF; + send SET GLOBAL innodb_log_file_size=5242880; --connect con1,localhost,root diff --git a/mysql-test/suite/sys_vars/r/sysvars_innodb.result b/mysql-test/suite/sys_vars/r/sysvars_innodb.result index a1228716b1d..e4d8480c1fc 100644 --- a/mysql-test/suite/sys_vars/r/sysvars_innodb.result +++ b/mysql-test/suite/sys_vars/r/sysvars_innodb.result @@ -4,6 +4,7 @@ variable_name not in ( 'innodb_numa_interleave', # only available WITH_NUMA 'innodb_evict_tables_on_commit_debug', # one may want to override this 'innodb_use_native_aio', # default value depends on OS +'innodb_log_file_mmap', # only available on 64-bit 'innodb_log_file_buffering', # only available on Linux and Windows 'innodb_buffer_pool_load_pages_abort') # debug build 
only, and is only for testing order by variable_name; diff --git a/mysql-test/suite/sys_vars/t/sysvars_innodb.test b/mysql-test/suite/sys_vars/t/sysvars_innodb.test index 2680e442da4..86f5ffddf1c 100644 --- a/mysql-test/suite/sys_vars/t/sysvars_innodb.test +++ b/mysql-test/suite/sys_vars/t/sysvars_innodb.test @@ -11,6 +11,7 @@ select VARIABLE_NAME, SESSION_VALUE, DEFAULT_VALUE, VARIABLE_SCOPE, VARIABLE_TYP 'innodb_numa_interleave', # only available WITH_NUMA 'innodb_evict_tables_on_commit_debug', # one may want to override this 'innodb_use_native_aio', # default value depends on OS + 'innodb_log_file_mmap', # only available on 64-bit 'innodb_log_file_buffering', # only available on Linux and Windows 'innodb_buffer_pool_load_pages_abort') # debug build only, and is only for testing order by variable_name; diff --git a/mysys/my_getpagesize.c b/mysys/my_getpagesize.c index 62d077ccdc4..4087ead1663 100644 --- a/mysys/my_getpagesize.c +++ b/mysys/my_getpagesize.c @@ -16,8 +16,6 @@ #include "mysys_priv.h" -#ifndef HAVE_GETPAGESIZE - #if defined _WIN32 int my_getpagesize(void) @@ -27,6 +25,13 @@ int my_getpagesize(void) return si.dwPageSize; } +#elif defined _SC_PAGESIZE + +int my_getpagesize(void) +{ + return (int)sysconf(_SC_PAGESIZE); +} + #else /* Default implementation */ @@ -36,6 +41,3 @@ int my_getpagesize(void) } #endif - -#endif - diff --git a/mysys/my_init.c b/mysys/my_init.c index 44488e5848a..04c36770289 100644 --- a/mysys/my_init.c +++ b/mysys/my_init.c @@ -151,9 +151,7 @@ my_bool my_init(void) my_umask= 0660; /* Default umask for new files */ my_umask_dir= 0700; /* Default umask for new directories */ my_global_flags= 0; -#ifdef _SC_PAGESIZE - my_system_page_size= sysconf(_SC_PAGESIZE); -#endif + my_system_page_size= my_getpagesize(); /* Default creation of new files */ if ((str= getenv("UMASK")) != 0) diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc index b919323f8b3..859ef0fcb4d 100644 --- a/storage/innobase/buf/buf0flu.cc +++ 
b/storage/innobase/buf/buf0flu.cc @@ -1766,7 +1766,7 @@ inline void log_t::write_checkpoint(lsn_t end_lsn) noexcept static_assert(CPU_LEVEL1_DCACHE_LINESIZE >= 64, "efficiency"); static_assert(CPU_LEVEL1_DCACHE_LINESIZE <= 4096, "compatibility"); byte* c= my_assume_aligned - (is_pmem() ? buf + offset : checkpoint_buf); + (is_mmap() ? buf + offset : checkpoint_buf); memset_aligned(c, 0, CPU_LEVEL1_DCACHE_LINESIZE); mach_write_to_8(my_assume_aligned<8>(c), next_checkpoint_lsn); mach_write_to_8(my_assume_aligned<8>(c + 8), end_lsn); @@ -1775,8 +1775,9 @@ inline void log_t::write_checkpoint(lsn_t end_lsn) noexcept lsn_t resizing; #ifdef HAVE_PMEM - if (is_pmem()) + if (is_mmap()) { + ut_ad(!is_opened()); resizing= resize_lsn.load(std::memory_order_relaxed); if (resizing > 1 && resizing <= next_checkpoint_lsn) @@ -1790,12 +1791,12 @@ inline void log_t::write_checkpoint(lsn_t end_lsn) noexcept else #endif { + ut_ad(!is_mmap()); ut_ad(!checkpoint_pending); checkpoint_pending= true; latch.wr_unlock(); log_write_and_flush_prepare(); resizing= resize_lsn.load(std::memory_order_relaxed); - /* FIXME: issue an asynchronous write */ ut_ad(ut_is_2pow(write_size)); ut_ad(write_size >= 512); ut_ad(write_size <= 4096); @@ -1838,9 +1839,9 @@ inline void log_t::write_checkpoint(lsn_t end_lsn) noexcept if (resizing > 1 && resizing <= checkpoint_lsn) { - ut_ad(is_pmem() == !resize_flush_buf); + ut_ad(is_mmap() == !resize_flush_buf); - if (!is_pmem()) + if (!is_mmap()) { if (srv_file_flush_method != SRV_O_DSYNC) ut_a(resize_log.flush()); @@ -1849,13 +1850,17 @@ inline void log_t::write_checkpoint(lsn_t end_lsn) noexcept if (resize_rename()) { - /* Resizing failed. Discard the log_sys.resize_log. */ + /* Resizing failed. Discard the ib_logfile101. 
*/ #ifdef HAVE_PMEM - if (is_pmem()) + if (is_mmap()) + { + ut_ad(!is_opened()); my_munmap(resize_buf, resize_target); + } else #endif { + ut_ad(!is_mmap()); ut_free_dodump(resize_buf, buf_size); ut_free_dodump(resize_flush_buf, buf_size); #ifdef _WIN32 @@ -1873,8 +1878,9 @@ inline void log_t::write_checkpoint(lsn_t end_lsn) noexcept { /* Adopt the resized log. */ #ifdef HAVE_PMEM - if (is_pmem()) + if (is_mmap()) { + ut_ad(!is_opened()); my_munmap(buf, file_size); buf= resize_buf; set_buf_free(START_OFFSET + (get_lsn() - resizing)); @@ -1882,6 +1888,7 @@ inline void log_t::write_checkpoint(lsn_t end_lsn) noexcept else #endif { + ut_ad(!is_mmap()); IF_WIN(,log.close()); std::swap(log, resize_log); ut_free_dodump(buf, buf_size); diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index d35889c4277..9acbd538d78 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -18532,7 +18532,10 @@ static void innodb_log_file_size_update(THD *thd, st_mysql_sys_var*, if (high_level_read_only) ib_senderrf(thd, IB_LOG_LEVEL_ERROR, ER_READ_ONLY_MODE); - else if (!log_sys.is_pmem() && + else if ( +#ifdef HAVE_PMEM + !log_sys.is_mmap() && +#endif *static_cast(save) < log_sys.buf_size) my_printf_error(ER_WRONG_ARGUMENTS, "innodb_log_file_size must be at least" @@ -18573,7 +18576,7 @@ static void innodb_log_file_size_update(THD *thd, st_mysql_sys_var*, mysql_mutex_unlock(&buf_pool.flush_list_mutex); if (start > log_sys.get_lsn()) { - ut_ad(!log_sys.is_pmem()); + ut_ad(!log_sys.is_mmap()); /* The server is almost idle. Write dummy FILE_CHECKPOINT records to ensure that the log resizing will complete. 
*/ log_sys.latch.wr_lock(SRW_LOCK_CALL); @@ -19437,6 +19440,19 @@ static MYSQL_SYSVAR_UINT(log_buffer_size, log_sys.buf_size, "Redo log buffer size in bytes.", NULL, NULL, 16U << 20, 2U << 20, log_sys.buf_size_max, 4096); +#ifdef HAVE_INNODB_MMAP + static constexpr const char *innodb_log_file_mmap_description= + "Whether ib_logfile0" +# ifdef HAVE_PMEM + " resides in persistent memory or" +# endif + " should initially be memory-mapped"; +static MYSQL_SYSVAR_BOOL(log_file_mmap, log_sys.log_mmap, + PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY, + innodb_log_file_mmap_description, + nullptr, nullptr, log_sys.log_mmap_default); +#endif + #if defined __linux__ || defined _WIN32 static MYSQL_SYSVAR_BOOL(log_file_buffering, log_sys.log_buffered, PLUGIN_VAR_OPCMDARG, @@ -19922,6 +19938,9 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(deadlock_report), MYSQL_SYSVAR(page_size), MYSQL_SYSVAR(log_buffer_size), +#ifdef HAVE_INNODB_MMAP + MYSQL_SYSVAR(log_file_mmap), +#endif #if defined __linux__ || defined _WIN32 MYSQL_SYSVAR(log_file_buffering), #endif diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h index cb45f931262..90261050017 100644 --- a/storage/innobase/include/log0log.h +++ b/storage/innobase/include/log0log.h @@ -118,15 +118,14 @@ public: @return file size in bytes @retval 0 if not readable */ os_offset_t open(bool read_only) noexcept; + + /** @return whether a handle to the log is open */ bool is_opened() const noexcept { return m_file != OS_FILE_CLOSED; } dberr_t close() noexcept; dberr_t read(os_offset_t offset, span buf) noexcept; void write(os_offset_t offset, span buf) noexcept; bool flush() const noexcept { return os_file_flush(m_file); } -#ifdef HAVE_PMEM - byte *mmap(bool read_only, const struct stat &st) noexcept; -#endif }; /** Redo log buffer */ @@ -189,7 +188,7 @@ private: public: /** number of append_prepare_wait(); protected by lock_lsn() or lsn_lock */ size_t waits; - /** 
innodb_log_buffer_size (size of buf,flush_buf if !is_pmem(), in bytes) */ + /** innodb_log_buffer_size (size of buf,flush_buf if !is_mmap(), in bytes) */ unsigned buf_size; /** log file size in bytes, including the header */ lsn_t file_size; @@ -231,7 +230,7 @@ public: /** Last written LSN */ lsn_t write_lsn; - /** buffer for writing data to ib_logfile0, or nullptr if is_pmem() + /** Buffer for writing data to ib_logfile0, or nullptr if is_mmap(). In write_buf(), buf and flush_buf may be swapped */ byte *flush_buf; @@ -280,6 +279,19 @@ public: uint write_size; /** format of the redo log: e.g., FORMAT_10_8 */ uint32_t format; +#ifdef HAVE_INNODB_MMAP + /** whether the memory-mapped interface is enabled for the log */ + my_bool log_mmap; + /** the default value of log_mmap */ + static constexpr bool log_mmap_default= +# if defined __linux__ /* MAP_POPULATE would enable read-ahead */ + true || +# elif defined __FreeBSD__ /* MAP_PREFAULT_READ would enable read-ahead */ + true || +# else /* an unnecessary read-ahead of a large ib_logfile0 is a risk */ +# endif + false; +#endif #if defined __linux__ || defined _WIN32 /** whether file system caching is enabled for the log */ my_bool log_buffered; @@ -322,7 +334,7 @@ public: /** whether there is capacity in the log buffer */ bool buf_free_ok() const noexcept { - ut_ad(!is_pmem()); + ut_ad(!is_mmap()); return (buf_free.load(std::memory_order_relaxed) & ~buf_free_LOCK) < max_buf_free; } @@ -332,12 +344,14 @@ public: void set_buf_free(size_t f) noexcept { ut_ad(f < buf_free_LOCK); buf_free.store(f, std::memory_order_relaxed); } -#ifdef HAVE_PMEM - bool is_pmem() const noexcept { return !flush_buf; } +#ifdef HAVE_INNODB_MMAP + bool is_mmap() const noexcept { return !flush_buf; } #else - static constexpr bool is_pmem() { return false; } + static constexpr bool is_mmap() { return false; } #endif + /** @return whether a handle to the log is open; + is_mmap() && !is_opened() holds for PMEM */ bool is_opened() const noexcept { 
return log.is_opened(); } /** @return target write LSN to react on !buf_free_ok() */ @@ -381,40 +395,33 @@ public: @return whether an error occurred */ static bool resize_rename() noexcept; -#ifdef HAVE_PMEM /** @return pointer for writing to resize_buf - @retval nullptr if no PMEM based resizing is active */ + @retval nullptr if no is_mmap() based resizing is active */ inline byte *resize_buf_begin(lsn_t lsn) const noexcept; /** @return end of resize_buf */ inline const byte *resize_buf_end() const noexcept { return resize_buf + resize_target; } /** Initialise the redo log subsystem. */ - void create_low(); - /** Initialise the redo log subsystem. - @return whether the initialisation succeeded */ - bool create() { create_low(); return true; } + void create(); /** Attach a log file. @return whether the memory allocation succeeded */ bool attach(log_file_t file, os_offset_t size); -#else - /** Initialise the redo log subsystem. - @return whether the initialisation succeeded */ - bool create(); - /** Attach a log file. */ - void attach_low(log_file_t file, os_offset_t size); - bool attach(log_file_t file, os_offset_t size) - { attach_low(file, size); return true; } -#endif +#ifdef HAVE_INNODB_MMAP + /** Disable memory-mapped access (update log_mmap) */ + void clear_mmap(); + void close_file(bool really_close= true); +#else + static void clear_mmap() {} + void close_file(); +#endif #if defined __linux__ || defined _WIN32 /** Try to enable or disable file system caching (update log_buffered) */ void set_buffered(bool buffered); #endif - void close_file(); - /** Calculate the checkpoint safety margins. */ static void set_capacity(); @@ -494,11 +501,11 @@ private: public: /** Reserve space in the log buffer for appending data. 
@tparam spin whether to use the spin-only lock_lsn() - @tparam pmem log_sys.is_pmem() + @tparam mmap log_sys.is_mmap() @param size total length of the data to append(), in bytes @param ex whether log_sys.latch is exclusively locked @return the start LSN and the buffer position for append() */ - template + template std::pair append_prepare(size_t size, bool ex) noexcept; /** Append a string of bytes to the redo log. diff --git a/storage/innobase/include/log0recv.h b/storage/innobase/include/log0recv.h index a73b727991c..6cf79c857e4 100644 --- a/storage/innobase/include/log0recv.h +++ b/storage/innobase/include/log0recv.h @@ -408,19 +408,18 @@ private: ATTRIBUTE_COLD void report_progress() const; public: /** Parse and register one log_t::FORMAT_10_8 mini-transaction, - handling log_sys.is_pmem() buffer wrap-around. + without handling any log_sys.is_mmap() buffer wrap-around. @tparam store whether to store the records @param if_exists if store: whether to check if the tablespace exists */ template static parse_mtr_result parse_mtr(bool if_exists) noexcept; - /** Parse and register one log_t::FORMAT_10_8 mini-transaction, - handling log_sys.is_pmem() buffer wrap-around. + handling log_sys.is_mmap() buffer wrap-around. @tparam store whether to store the records @param if_exists if store: whether to check if the tablespace exists */ template - static parse_mtr_result parse_pmem(bool if_exists) noexcept -#ifdef HAVE_PMEM + static parse_mtr_result parse_mmap(bool if_exists) noexcept +#ifdef HAVE_INNODB_MMAP ; #else { return parse_mtr(if_exists); } diff --git a/storage/innobase/include/mtr0mtr.h b/storage/innobase/include/mtr0mtr.h index bfa66216184..79211326030 100644 --- a/storage/innobase/include/mtr0mtr.h +++ b/storage/innobase/include/mtr0mtr.h @@ -696,7 +696,7 @@ private: ATTRIBUTE_NOINLINE void encrypt(); /** Commit the mini-transaction log. 
- @tparam pmem log_sys.is_pmem() + @tparam pmem log_sys.is_mmap() @param mtr mini-transaction @param lsns {start_lsn,flush_ahead} */ template @@ -708,11 +708,11 @@ private: /** Append the redo log records to the redo log buffer. @tparam spin whether to use the spin-only log_sys.lock_lsn() - @tparam pmem log_sys.is_pmem() + @tparam mmap log_sys.is_mmap() @param mtr mini-transaction @param len number of bytes to write @return {start_lsn,flush_ahead} */ - template static + template static std::pair finish_writer(mtr_t *mtr, size_t len); /** The applicable variant of commit_log() */ diff --git a/storage/innobase/include/univ.i b/storage/innobase/include/univ.i index 1b4f70b683b..70a136273ee 100644 --- a/storage/innobase/include/univ.i +++ b/storage/innobase/include/univ.i @@ -170,6 +170,9 @@ using the call command. */ #define UNIV_INLINE static inline #define UNIV_WORD_SIZE SIZEOF_SIZE_T +#if SIZEOF_SIZE_T == 8 +# define HAVE_INNODB_MMAP +#endif /** The following alignment is used in memory allocations in memory heap management to ensure correct alignment for doubles etc. */ @@ -199,7 +202,7 @@ and 2 bits for flags. This limits the uncompressed page size to 16k. /* Define the Min, Max, Default page sizes. */ /** Minimum Page Size Shift (power of 2) */ #define UNIV_PAGE_SIZE_SHIFT_MIN 12U -/** log2 of largest page size (1<<16 == 64436 bytes). */ +/** log2 of largest page size (1<<16 == 65536 bytes). */ /** Maximum Page Size Shift (power of 2) */ #define UNIV_PAGE_SIZE_SHIFT_MAX 16U /** log2 of default page size (1<<14 == 16384 bytes). 
*/ diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc index 4ec43a81531..dde2b3f41a9 100644 --- a/storage/innobase/log/log0log.cc +++ b/storage/innobase/log/log0log.cc @@ -86,11 +86,7 @@ void log_t::set_capacity() log_sys.max_checkpoint_age = margin; } -#ifdef HAVE_PMEM -void log_t::create_low() -#else -bool log_t::create() -#endif +void log_t::create() { ut_ad(this == &log_sys); ut_ad(!is_initialised()); @@ -101,35 +97,10 @@ bool log_t::create() need_checkpoint.store(true, std::memory_order_relaxed); write_lsn= FIRST_LSN; -#ifndef HAVE_PMEM - buf= static_cast(ut_malloc_dontdump(buf_size, PSI_INSTRUMENT_ME)); - if (!buf) - { - alloc_fail: - sql_print_error("InnoDB: Cannot allocate memory;" - " too large innodb_log_buffer_size?"); - return false; - } - flush_buf= static_cast(ut_malloc_dontdump(buf_size, - PSI_INSTRUMENT_ME)); - if (!flush_buf) - { - ut_free_dodump(buf, buf_size); - buf= nullptr; - goto alloc_fail; - } - - TRASH_ALLOC(buf, buf_size); - TRASH_ALLOC(flush_buf, buf_size); - checkpoint_buf= static_cast(aligned_malloc(4096, 4096)); - memset_aligned<4096>(checkpoint_buf, 0, 4096); - max_buf_free= buf_size / LOG_BUF_FLUSH_RATIO - LOG_BUF_FLUSH_MARGIN; -#else ut_ad(!checkpoint_buf); ut_ad(!buf); ut_ad(!flush_buf); max_buf_free= 1; -#endif latch.SRW_LOCK_INIT(log_latch_key); lsn_lock.init(); @@ -144,9 +115,6 @@ bool log_t::create() set_buf_free(0); ut_ad(is_initialised()); -#ifndef HAVE_PMEM - return true; -#endif } dberr_t log_file_t::close() noexcept @@ -178,22 +146,91 @@ void log_file_t::write(os_offset_t offset, span buf) noexcept << IF_WIN(GetLastError(), errno) << "."; } -#ifdef HAVE_PMEM -# include "cache.h" +#ifdef HAVE_INNODB_MMAP +# ifdef HAVE_PMEM +# include "cache.h" +# endif /** Attempt to memory map a file. 
@param file log file handle @param size file size @return pointer to memory mapping @retval MAP_FAILED if the memory cannot be mapped */ -static void *log_mmap(os_file_t file, os_offset_t size) +static void *log_mmap(os_file_t file, +# ifdef HAVE_PMEM + bool &is_pmem, /*!< whether the file is on pmem */ +# endif + os_offset_t size) { - void *ptr= - my_mmap(0, size_t(size), - srv_read_only_mode ? PROT_READ : PROT_READ | PROT_WRITE, - MAP_SHARED_VALIDATE | MAP_SYNC, file, 0); -#ifdef __linux__ - if (ptr == MAP_FAILED) + if (my_system_page_size > 4096) + return MAP_FAILED; +# ifndef HAVE_PMEM + if (!log_sys.log_mmap) + /* If support for persistent memory (Linux: mount -o dax) is enabled, + we always attempt to open a MAP_SYNC memory mapping to ib_logfile0. + This mapping will be read-only during crash recovery, and read-write + during normal operation. + + A regular read-only memory mapping may be attempted if + innodb_log_file_mmap=ON. This may benefit mariadb-backup + and crash recovery. */ + return MAP_FAILED; +# endif + + /* For now, InnoDB does not support memory-mapped writes to + a regular log file. + + If PMEM is supported, the initially attempted memory mapping may + be read-write, but the fallback will be read-only. + + The mapping will always be read-only if innodb_read_only=ON or + if mariadb-backup is running in any other mode than --prepare --export. 
*/ + const bool read_only= + srv_read_only_mode || srv_operation >= SRV_OPERATION_BACKUP; + +# ifdef _WIN32 + void *ptr= MAP_FAILED; + if (!read_only); + else if (HANDLE h= + CreateFileMappingA(file, nullptr, PAGE_READONLY, + DWORD(size >> 32), DWORD(size), nullptr)) + { + if (h != INVALID_HANDLE_VALUE) + { + ptr= MapViewOfFileEx(h, FILE_MAP_READ, 0, 0, size, nullptr); + CloseHandle(h); + if (!ptr) + ptr= MAP_FAILED; + } + } +# else + int flags= +# ifdef HAVE_PMEM + MAP_SHARED_VALIDATE | MAP_SYNC, +# else + MAP_SHARED, +# endif + prot= PROT_READ; + + if (!read_only) +# ifdef HAVE_PMEM + prot= PROT_READ | PROT_WRITE; +# else + return MAP_FAILED; +# endif + + void *ptr= my_mmap(0, size_t(size), prot, flags, file, 0); + +# ifdef HAVE_PMEM + is_pmem= ptr != MAP_FAILED; +# endif + + if (ptr != MAP_FAILED) + return ptr; + +# ifdef HAVE_PMEM +# ifdef __linux__ /* On Linux, we pretend that /dev/shm is PMEM */ + if (srv_operation < SRV_OPERATION_BACKUP) { struct stat st; if (!fstat(file, &st)) @@ -203,46 +240,82 @@ static void *log_mmap(os_file_t file, os_offset_t size) if (!stat("/dev/shm", &st)) { MSAN_STAT_WORKAROUND(&st); - if (st.st_dev == st_dev) - ptr= my_mmap(0, size_t(size), - srv_read_only_mode ? PROT_READ : PROT_READ | PROT_WRITE, - MAP_SHARED, file, 0); + is_pmem= st.st_dev == st_dev; + if (!is_pmem) + return ptr; /* MAP_FAILED */ } } } -#endif /* __linux__ */ +# endif /* __linux__ */ + if (read_only && log_sys.log_mmap) + ptr= my_mmap(0, size_t(size), PROT_READ, MAP_SHARED, file, 0); +# endif /* HAVE_PMEM */ +# endif return ptr; } #endif -#ifdef HAVE_PMEM -bool log_t::attach(log_file_t file, os_offset_t size) +#if defined __linux__ || defined _WIN32 +/** Display a message about opening the log */ +ATTRIBUTE_COLD static void log_file_message() +{ + sql_print_information("InnoDB: %s (block size=%u bytes)", +# ifdef HAVE_INNODB_MMAP + log_sys.log_mmap + ? (log_sys.log_buffered + ? 
"Memory-mapped log" + : "Memory-mapped unbuffered log") + : +# endif + log_sys.log_buffered + ? "Buffered log writes" + : "File system buffers for log disabled", + log_sys.write_size); +} #else -void log_t::attach_low(log_file_t file, os_offset_t size) +static inline void log_file_message() {} #endif + +bool log_t::attach(log_file_t file, os_offset_t size) { log= file; ut_ad(!size || size >= START_OFFSET + SIZE_OF_FILE_CHECKPOINT); file_size= size; -#ifdef HAVE_PMEM ut_ad(!buf); ut_ad(!flush_buf); - if (size && !(size_t(size) & 4095) && srv_operation != SRV_OPERATION_BACKUP) +#ifdef HAVE_INNODB_MMAP + if (size) { - void *ptr= log_mmap(log.m_file, size); +# ifdef HAVE_PMEM + bool is_pmem; + void *ptr= ::log_mmap(log.m_file, is_pmem, size); +# else + void *ptr= ::log_mmap(log.m_file, size); +# endif if (ptr != MAP_FAILED) { - log.close(); - mprotect(ptr, size_t(size), PROT_READ); +# ifdef HAVE_PMEM + if (is_pmem) + { + log.close(); + log_buffered= false; + log_maybe_unbuffered= true; + IF_WIN(,mprotect(ptr, size_t(size), PROT_READ)); + } +# endif buf= static_cast(ptr); max_buf_free= 1; - log_maybe_unbuffered= true; - log_buffered= false; mtr_t::finisher_update(); - return true; +# ifdef HAVE_PMEM + if (is_pmem) + return true; +# endif + goto func_exit; } } + log_mmap= false; +#endif buf= static_cast(ut_malloc_dontdump(buf_size, PSI_INSTRUMENT_ME)); if (!buf) { @@ -256,33 +329,34 @@ void log_t::attach_low(log_file_t file, os_offset_t size) PSI_INSTRUMENT_ME)); if (!flush_buf) { + alloc_fail2: ut_free_dodump(buf, buf_size); buf= nullptr; goto alloc_fail; } - TRASH_ALLOC(buf, buf_size); - TRASH_ALLOC(flush_buf, buf_size); - max_buf_free= buf_size / LOG_BUF_FLUSH_RATIO - LOG_BUF_FLUSH_MARGIN; -#endif - -#if defined __linux__ || defined _WIN32 - sql_print_information("InnoDB: %s (block size=%u bytes)", - log_buffered - ? 
"Buffered log writes" - : "File system buffers for log disabled", - write_size); -#endif - - mtr_t::finisher_update(); -#ifdef HAVE_PMEM ut_ad(ut_is_2pow(write_size)); ut_ad(write_size >= 512); ut_ad(write_size <= 4096); checkpoint_buf= static_cast(aligned_malloc(write_size, write_size)); + if (!checkpoint_buf) + { + ut_free_dodump(flush_buf, buf_size); + flush_buf= nullptr; + goto alloc_fail2; + } + + TRASH_ALLOC(buf, buf_size); + TRASH_ALLOC(flush_buf, buf_size); + max_buf_free= buf_size / LOG_BUF_FLUSH_RATIO - LOG_BUF_FLUSH_MARGIN; + mtr_t::finisher_update(); memset_aligned<512>(checkpoint_buf, 0, write_size); - return true; + +#ifdef HAVE_INNODB_MMAP + func_exit: #endif + log_file_message(); + return true; } /** Write a log file header. @@ -325,66 +399,83 @@ void log_t::create(lsn_t lsn) noexcept last_checkpoint_lsn= 0; -#ifdef HAVE_PMEM - if (is_pmem()) - { - mprotect(buf, size_t(file_size), PROT_READ | PROT_WRITE); - memset_aligned<4096>(buf, 0, 4096); - set_buf_free(START_OFFSET); - } - else -#endif - { - set_buf_free(0); - memset_aligned<4096>(flush_buf, 0, buf_size); - memset_aligned<4096>(buf, 0, buf_size); - } - - log_sys.header_write(buf, lsn, is_encrypted()); DBUG_PRINT("ib_log", ("write header " LSN_PF, lsn)); #ifdef HAVE_PMEM - if (is_pmem()) + if (is_mmap()) + { + ut_ad(!is_opened()); + mprotect(buf, size_t(file_size), PROT_READ | PROT_WRITE); + memset_aligned<4096>(buf, 0, 4096); + log_sys.header_write(buf, lsn, is_encrypted()); + set_buf_free(START_OFFSET); pmem_persist(buf, 512); + } else #endif { + ut_ad(!is_mmap()); + set_buf_free(0); + memset_aligned<4096>(flush_buf, 0, buf_size); + memset_aligned<4096>(buf, 0, buf_size); + log_sys.header_write(buf, lsn, is_encrypted()); log.write(0, {buf, 4096}); memset_aligned<512>(buf, 0, 512); } } -void log_t::close_file() +ATTRIBUTE_COLD static void log_close_failed(dberr_t err) { -#ifdef HAVE_PMEM - if (is_pmem()) + ib::fatal() << "closing ib_logfile0 failed: " << err; +} + +#ifdef HAVE_INNODB_MMAP +void 
log_t::close_file(bool really_close) +#else +void log_t::close_file() +#endif +{ +#ifdef HAVE_INNODB_MMAP + if (is_mmap()) { - ut_ad(!is_opened()); ut_ad(!checkpoint_buf); + ut_ad(!flush_buf); if (buf) { my_munmap(buf, file_size); buf= nullptr; } - return; + } + else +#endif + { + ut_ad(!buf == !flush_buf); + ut_ad(!buf == !checkpoint_buf); + if (buf) + { + ut_free_dodump(buf, buf_size); + buf= nullptr; + ut_free_dodump(flush_buf, buf_size); + flush_buf= nullptr; + } + aligned_free(checkpoint_buf); + checkpoint_buf= nullptr; } - ut_free_dodump(buf, buf_size); - buf= nullptr; - ut_free_dodump(flush_buf, buf_size); - flush_buf= nullptr; - aligned_free(checkpoint_buf); - checkpoint_buf= nullptr; +#ifdef HAVE_INNODB_MMAP + if (really_close) #endif - if (is_opened()) - if (const dberr_t err= log.close()) - ib::fatal() << "closing ib_logfile0 failed: " << err; + if (is_opened()) + if (const dberr_t err= log.close()) + log_close_failed(err); } /** Acquire all latches that protect the log. */ static void log_resize_acquire() { - if (!log_sys.is_pmem()) +#ifdef HAVE_PMEM + if (!log_sys.is_mmap()) +#endif { while (flush_lock.acquire(log_sys.get_lsn() + 1, nullptr) != group_commit_lock::ACQUIRED); @@ -400,7 +491,9 @@ void log_resize_release() { log_sys.latch.wr_unlock(); - if (!log_sys.is_pmem()) +#ifdef HAVE_PMEM + if (!log_sys.is_mmap()) +#endif { lsn_t lsn1= write_lock.release(write_lock.value()); lsn_t lsn2= flush_lock.release(flush_lock.value()); @@ -413,13 +506,17 @@ void log_resize_release() /** Try to enable or disable file system caching (update log_buffered) */ void log_t::set_buffered(bool buffered) { - if (!log_maybe_unbuffered || is_pmem() || high_level_read_only) + if (!log_maybe_unbuffered || +#ifdef HAVE_PMEM + is_mmap() || +#endif + high_level_read_only) return; log_resize_acquire(); if (!resize_in_progress() && is_opened() && bool(log_buffered) != buffered) { - os_file_close_func(log.m_file); - log.m_file= OS_FILE_CLOSED; + if (const dberr_t err= 
log.close()) + log_close_failed(err); std::string path{get_log_file_path()}; log_buffered= buffered; bool success; @@ -427,11 +524,7 @@ void log_t::set_buffered(bool buffered) OS_FILE_OPEN, OS_FILE_NORMAL, OS_LOG_FILE, false, &success); ut_a(log.m_file != OS_FILE_CLOSED); - sql_print_information("InnoDB: %s (block size=%u bytes)", - log_buffered - ? "Buffered log writes" - : "File system buffers for log disabled", - write_size); + log_file_message(); } log_resize_release(); } @@ -450,6 +543,9 @@ log_t::resize_start_status log_t::resize_start(os_offset_t size) noexcept resize_start_status status= RESIZE_NO_CHANGE; lsn_t start_lsn{0}; +#ifdef HAVE_PMEM + bool is_pmem{false}; +#endif if (resize_in_progress()) status= RESIZE_IN_PROGRESS; @@ -475,10 +571,15 @@ log_t::resize_start_status log_t::resize_start(os_offset_t size) noexcept void *ptr= nullptr, *ptr2= nullptr; success= os_file_set_size(path.c_str(), resize_log.m_file, size); if (!success); -#ifdef HAVE_PMEM - else if (is_pmem()) +#ifdef HAVE_INNODB_MMAP + else if (is_mmap()) { - ptr= log_mmap(resize_log.m_file, size); + ptr= ::log_mmap(resize_log.m_file, +#ifdef HAVE_PMEM + is_pmem, +#endif + size); + if (ptr == MAP_FAILED) goto alloc_fail; } @@ -518,12 +619,12 @@ log_t::resize_start_status log_t::resize_start(os_offset_t size) noexcept resize_flush_buf= static_cast(ptr2); start_lsn= get_lsn(); - if (is_pmem()) - resize_log.close(); - else + if (!is_mmap()) start_lsn= first_lsn + (~lsn_t{write_size - 1} & (lsn_t{write_size - 1} + start_lsn - first_lsn)); + else if (!is_opened()) + resize_log.close(); } resize_lsn.store(start_lsn, std::memory_order_relaxed); status= success ? 
RESIZE_STARTED : RESIZE_FAILED; @@ -552,14 +653,13 @@ void log_t::resize_abort() noexcept if (resize_in_progress() > 1) { - if (!is_pmem()) + if (!is_mmap()) { - resize_log.close(); ut_free_dodump(resize_buf, buf_size); ut_free_dodump(resize_flush_buf, buf_size); resize_flush_buf= nullptr; } -#ifdef HAVE_PMEM +#ifdef HAVE_INNODB_MMAP else { ut_ad(!resize_log.is_opened()); @@ -568,6 +668,8 @@ void log_t::resize_abort() noexcept my_munmap(resize_buf, resize_target); } #endif + if (resize_log.is_opened()) + resize_log.close(); resize_buf= nullptr; resize_target= 0; resize_lsn.store(0, std::memory_order_relaxed); @@ -732,7 +834,7 @@ static size_t log_pad(lsn_t lsn, size_t pad, byte *begin, byte *extra) #ifdef HAVE_PMEM void log_t::persist(lsn_t lsn, bool holding_latch) noexcept { - ut_ad(is_pmem()); + ut_ad(!is_opened()); ut_ad(!write_lock.is_owner()); ut_ad(!flush_lock.is_owner()); #ifdef LOG_LATCH_DEBUG @@ -752,12 +854,11 @@ void log_t::persist(lsn_t lsn, bool holding_latch) noexcept if (UNIV_UNLIKELY(end < start)) { - pmem_persist(log_sys.buf + start, log_sys.file_size - start); - pmem_persist(log_sys.buf + log_sys.START_OFFSET, - end - log_sys.START_OFFSET); + pmem_persist(buf + start, file_size - start); + pmem_persist(buf + START_OFFSET, end - START_OFFSET); } else - pmem_persist(log_sys.buf + start, end - start); + pmem_persist(buf + start, end - start); old= flushed_to_disk_lsn.load(std::memory_order_relaxed); @@ -818,7 +919,7 @@ void log_t::resize_write_buf(const byte *b, size_t length) noexcept template inline lsn_t log_t::write_buf() noexcept { ut_ad(latch_have_wr()); - ut_ad(!is_pmem()); + ut_ad(!is_mmap()); ut_ad(!srv_read_only_mode); const lsn_t lsn{get_lsn(std::memory_order_relaxed)}; @@ -941,7 +1042,7 @@ bool log_t::flush(lsn_t lsn) noexcept */ static lsn_t log_flush(lsn_t lsn) { - ut_ad(!log_sys.is_pmem()); + ut_ad(!log_sys.is_mmap()); ut_a(log_sys.flush(lsn)); DBUG_EXECUTE_IF("crash_after_log_write_upto", DBUG_SUICIDE();); return 
flush_lock.release(lsn); @@ -961,6 +1062,7 @@ void log_write_up_to(lsn_t lsn, bool durable, ut_ad(!srv_read_only_mode || log_sys.buf_free_ok()); ut_ad(lsn != LSN_MAX); ut_ad(lsn != 0); + ut_ad(!log_sys.is_mmap() || !callback || durable); if (UNIV_UNLIKELY(recv_no_ibuf_operations)) { @@ -973,21 +1075,25 @@ void log_write_up_to(lsn_t lsn, bool durable, ut_ad(lsn <= log_sys.get_lsn()); #ifdef HAVE_PMEM - if (log_sys.is_pmem()) + if (log_sys.is_mmap()) { - ut_ad(!callback); if (durable) log_sys.persist(lsn, false); return; } #endif + ut_ad(!log_sys.is_mmap()); repeat: if (durable) { if (flush_lock.acquire(lsn, callback) != group_commit_lock::ACQUIRED) return; - flush_lock.set_pending(log_sys.get_lsn()); + /* Promise to other concurrent flush_lock.acquire() that we + will be durable at least up to the current LSN. The LSN may still + advance until we acquire log_sys.latch below. */ + lsn= log_sys.get_lsn(); + flush_lock.set_pending(lsn); } lsn_t pending_write_lsn= 0, pending_flush_lsn= 0; @@ -1023,8 +1129,10 @@ void log_buffer_flush_to_disk(bool durable) /** Prepare to invoke log_write_and_flush(), before acquiring log_sys.latch. 
*/ ATTRIBUTE_COLD void log_write_and_flush_prepare() { - if (log_sys.is_pmem()) +#ifdef HAVE_PMEM + if (log_sys.is_mmap()) return; +#endif while (flush_lock.acquire(log_sys.get_lsn() + 1, nullptr) != group_commit_lock::ACQUIRED); @@ -1032,20 +1140,56 @@ ATTRIBUTE_COLD void log_write_and_flush_prepare() group_commit_lock::ACQUIRED); } +#ifdef HAVE_INNODB_MMAP +void log_t::clear_mmap() +{ + if (!is_mmap() || +#ifdef HAVE_PMEM + !is_opened() || +#endif + high_level_read_only) + return; + log_resize_acquire(); + ut_ad(!resize_in_progress()); + ut_ad(write_lsn == get_lsn()); + ut_ad(write_lsn == get_flushed_lsn(std::memory_order_relaxed)); + + if (buf) /* this may be invoked while creating a new database */ + { + alignas(16) byte log_block[4096]; + const size_t bs{write_size}; + const size_t bf{buf_free.load(std::memory_order_relaxed)}; + { + byte *const b= buf; + memcpy_aligned<16>(log_block, b + (bf & ~(bs - 1)), bs); + } + + close_file(false); + log_mmap= false; + ut_a(attach(log, file_size)); + ut_ad(!is_mmap()); + + set_buf_free(bf & (bs - 1)); + memcpy_aligned<16>(log_sys.buf, log_block, bs); + } + log_resize_release(); +} +#endif + /** Durably write the log up to log_sys.get_lsn(). 
*/ ATTRIBUTE_COLD void log_write_and_flush() { ut_ad(!srv_read_only_mode); - if (!log_sys.is_pmem()) +#ifdef HAVE_PMEM + if (log_sys.is_mmap()) + log_sys.persist(log_sys.get_lsn(), true); + else +#endif { const lsn_t lsn{log_sys.write_buf()}; write_lock.release(lsn); log_flush(lsn); } -#ifdef HAVE_PMEM - else - log_sys.persist(log_sys.get_lsn(), true); -#endif } /****************************************************************//** @@ -1323,18 +1467,9 @@ void log_t::close() if (!is_initialised()) return; close_file(); -#ifndef HAVE_PMEM - ut_free_dodump(buf, buf_size); - buf= nullptr; - ut_free_dodump(flush_buf, buf_size); - flush_buf= nullptr; - aligned_free(checkpoint_buf); - checkpoint_buf= nullptr; -#else ut_ad(!checkpoint_buf); ut_ad(!buf); ut_ad(!flush_buf); -#endif latch.destroy(); lsn_lock.destroy(); diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc index 2b70501dc11..019001dd37f 100644 --- a/storage/innobase/log/log0recv.cc +++ b/storage/innobase/log/log0recv.cc @@ -1480,6 +1480,7 @@ void recv_sys_t::debug_free() pages_it= pages.end(); mysql_mutex_unlock(&mutex); + log_sys.clear_mmap(); } @@ -1632,7 +1633,7 @@ ATTRIBUTE_COLD static dberr_t recv_log_recover_pre_10_2() byte *buf= const_cast(field_ref_zero); - if (source_offset < (log_sys.is_pmem() ? log_sys.file_size : 4096)) + if (source_offset < (log_sys.is_mmap() ? log_sys.file_size : 4096)) memcpy_aligned<512>(buf, &log_sys.buf[source_offset & ~511], 512); else if (dberr_t err= recv_sys.read(source_offset & ~511, {buf, 512})) @@ -1671,7 +1672,7 @@ static dberr_t recv_log_recover_10_5(lsn_t lsn_offset) { byte *buf= const_cast(field_ref_zero); - if (lsn_offset < (log_sys.is_pmem() ? log_sys.file_size : 4096)) + if (lsn_offset < (log_sys.is_mmap() ? 
log_sys.file_size : 4096)) memcpy_aligned<512>(buf, &log_sys.buf[lsn_offset & ~511], 512); else { @@ -1772,7 +1773,7 @@ dberr_t recv_sys_t::find_checkpoint() log_sys.next_checkpoint_lsn= 0; lsn= 0; buf= my_assume_aligned<4096>(log_sys.buf); - if (!log_sys.is_pmem()) + if (!log_sys.is_mmap()) if (dberr_t err= log_sys.log.read(0, {buf, log_sys.START_OFFSET})) return err; /* Check the header page checksum. There was no @@ -2210,7 +2211,7 @@ static void store_freed_or_init_rec(page_id_t page_id, bool freed) /** Wrapper for log_sys.buf[] between recv_sys.offset and recv_sys.len */ struct recv_buf { - bool is_pmem() const noexcept { return log_sys.is_pmem(); } + bool is_mmap() const noexcept { return log_sys.is_mmap(); } const byte *ptr; @@ -2301,11 +2302,11 @@ struct recv_buf } }; -#ifdef HAVE_PMEM +#ifdef HAVE_INNODB_MMAP /** Ring buffer wrapper for log_sys.buf[]; recv_sys.len == log_sys.file_size */ struct recv_ring : public recv_buf { - static constexpr bool is_pmem() { return true; } + static constexpr bool is_mmap() { return true; } constexpr recv_ring(const byte *ptr) : recv_buf(ptr) {} @@ -2598,7 +2599,7 @@ restart: ut_d(const source el{l}); lsn+= l - begin; offset= l.ptr - log_sys.buf; - if (!l.is_pmem()); + if (!l.is_mmap()); else if (offset == log_sys.file_size) offset= log_sys.START_OFFSET; else @@ -3110,12 +3111,12 @@ recv_sys_t::parse_mtr_result recv_sys_t::parse_mtr(bool if_exists) noexcept template recv_sys_t::parse_mtr_result recv_sys_t::parse_mtr(bool) noexcept; -#ifdef HAVE_PMEM +#ifdef HAVE_INNODB_MMAP template -recv_sys_t::parse_mtr_result recv_sys_t::parse_pmem(bool if_exists) noexcept +recv_sys_t::parse_mtr_result recv_sys_t::parse_mmap(bool if_exists) noexcept { recv_sys_t::parse_mtr_result r{parse_mtr(if_exists)}; - if (UNIV_LIKELY(r != PREMATURE_EOF) || !log_sys.is_pmem()) + if (UNIV_LIKELY(r != PREMATURE_EOF) || !log_sys.is_mmap()) return r; ut_ad(recv_sys.len == log_sys.file_size); ut_ad(recv_sys.offset >= log_sys.START_OFFSET); @@ -3126,6 
+3127,10 @@ recv_sys_t::parse_mtr_result recv_sys_t::parse_pmem(bool if_exists) noexcept : &log_sys.buf[recv_sys.offset]}; return recv_sys.parse(s, if_exists); } + +/** for mariadb-backup; @see xtrabackup_copy_mmap_logfile() */ +template +recv_sys_t::parse_mtr_result recv_sys_t::parse_mmap(bool) noexcept; #endif /** Apply the hashed log records to the page, if the page lsn is less than the @@ -3996,7 +4001,7 @@ void recv_sys_t::apply(bool last_batch) log_sort_flush_list(); #ifdef HAVE_PMEM - if (last_batch && log_sys.is_pmem()) + if (last_batch && log_sys.is_mmap() && !log_sys.is_opened()) mprotect(log_sys.buf, len, PROT_READ | PROT_WRITE); #endif @@ -4024,15 +4029,13 @@ static bool recv_scan_log(bool last_phase) bool store{recv_sys.file_checkpoint != 0}; size_t buf_size= log_sys.buf_size; -#ifdef HAVE_PMEM - if (log_sys.is_pmem()) + if (log_sys.is_mmap()) { recv_sys.offset= size_t(log_sys.calc_lsn_offset(recv_sys.lsn)); buf_size= size_t(log_sys.file_size); recv_sys.len= size_t(log_sys.file_size); } else -#endif { recv_sys.offset= size_t(recv_sys.lsn - log_sys.get_first_lsn()) & block_size_1; @@ -4094,7 +4097,7 @@ static bool recv_scan_log(bool last_phase) for (;;) { const byte& b{log_sys.buf[recv_sys.offset]}; - r= recv_sys.parse_pmem(false); + r= recv_sys.parse_mmap(false); switch (r) { case recv_sys_t::PREMATURE_EOF: goto read_more; @@ -4124,7 +4127,7 @@ static bool recv_scan_log(bool last_phase) else { ut_ad(recv_sys.file_checkpoint != 0); - switch ((r= recv_sys.parse_pmem(false))) { + switch ((r= recv_sys.parse_mmap(false))) { case recv_sys_t::PREMATURE_EOF: goto read_more; case recv_sys_t::GOT_EOF: @@ -4146,11 +4149,11 @@ static bool recv_scan_log(bool last_phase) if (!store) skip_the_rest: - while ((r= recv_sys.parse_pmem(false)) == recv_sys_t::OK); + while ((r= recv_sys.parse_mmap(false)) == recv_sys_t::OK); else { uint16_t count= 0; - while ((r= recv_sys.parse_pmem(last_phase)) == recv_sys_t::OK) + while ((r= recv_sys.parse_mmap(last_phase)) == 
recv_sys_t::OK) if (!++count && recv_sys.report(time(nullptr))) { const size_t n= recv_sys.pages.size(); @@ -4189,10 +4192,9 @@ static bool recv_scan_log(bool last_phase) } read_more: -#ifdef HAVE_PMEM - if (log_sys.is_pmem()) + if (log_sys.is_mmap()) break; -#endif + if (recv_sys.is_corrupt_log()) break; @@ -4537,13 +4539,13 @@ inline void log_t::set_recovered() noexcept ut_ad(get_flushed_lsn() == get_lsn()); ut_ad(recv_sys.lsn == get_lsn()); size_t offset{recv_sys.offset}; - if (!is_pmem()) + if (!is_mmap()) { const size_t bs{log_sys.write_size}, bs_1{bs - 1}; memmove_aligned<512>(buf, buf + (offset & ~bs_1), bs); offset&= bs_1; } -#ifdef HAVE_PMEM +#ifndef _WIN32 else mprotect(buf, size_t(file_size), PROT_READ | PROT_WRITE); #endif diff --git a/storage/innobase/mtr/mtr0mtr.cc b/storage/innobase/mtr/mtr0mtr.cc index d531838b5fd..15370943a55 100644 --- a/storage/innobase/mtr/mtr0mtr.cc +++ b/storage/innobase/mtr/mtr0mtr.cc @@ -42,6 +42,7 @@ Created 11/26/1995 Heikki Tuuri #ifdef HAVE_PMEM void (*mtr_t::commit_logger)(mtr_t *, std::pair); #endif + std::pair (*mtr_t::finisher)(mtr_t *, size_t); unsigned mtr_t::spin_wait_delay; @@ -49,7 +50,7 @@ void mtr_t::finisher_update() { ut_ad(log_sys.latch_have_wr()); #ifdef HAVE_PMEM - if (log_sys.is_pmem()) + if (log_sys.is_mmap()) { commit_logger= mtr_t::commit_log; finisher= spin_wait_delay @@ -351,11 +352,11 @@ inline lsn_t log_t::get_write_target() const return write_lsn + max_buf_free / 2; } -template +template void mtr_t::commit_log(mtr_t *mtr, std::pair lsns) { size_t modified= 0; - const lsn_t write_lsn= pmem ? 0 : log_sys.get_write_target(); + const lsn_t write_lsn= mmap ? 
0 : log_sys.get_write_target(); if (mtr->m_made_dirty) { @@ -475,7 +476,7 @@ void mtr_t::commit_log(mtr_t *mtr, std::pair lsns) if (UNIV_UNLIKELY(lsns.second != PAGE_FLUSH_NO)) buf_flush_ahead(mtr->m_commit_lsn, lsns.second == PAGE_FLUSH_SYNC); - if (!pmem && UNIV_UNLIKELY(write_lsn != 0)) + if (!mmap && UNIV_UNLIKELY(write_lsn != 0)) log_write_up_to(write_lsn, false); } @@ -1011,7 +1012,7 @@ ATTRIBUTE_COLD size_t log_t::append_prepare_wait(size_t b, bool ex, lsn_t lsn) else latch.rd_unlock(); - log_write_up_to(lsn, is_pmem()); + log_write_up_to(lsn, is_mmap()); if (ex) latch.wr_lock(SRW_LOCK_CALL); @@ -1027,16 +1028,16 @@ ATTRIBUTE_COLD size_t log_t::append_prepare_wait(size_t b, bool ex, lsn_t lsn) /** Reserve space in the log buffer for appending data. @tparam spin whether to use the spin-only lock_lsn() -@tparam pmem log_sys.is_pmem() +@tparam mmap log_sys.is_mmap() @param size total length of the data to append(), in bytes @param ex whether log_sys.latch is exclusively locked @return the start LSN and the buffer position for append() */ -template +template inline std::pair log_t::append_prepare(size_t size, bool ex) noexcept { ut_ad(ex ? latch_have_wr() : latch_have_rd()); - ut_ad(pmem == is_pmem()); + ut_ad(mmap == is_mmap()); if (!spin) lsn_lock.wr_lock(); size_t b{spin ? lock_lsn() : buf_free.load(std::memory_order_relaxed)}; @@ -1044,7 +1045,7 @@ std::pair log_t::append_prepare(size_t size, bool ex) noexcept lsn_t l{lsn.load(std::memory_order_relaxed)}, end_lsn{l + size}; - if (UNIV_UNLIKELY(pmem + if (UNIV_UNLIKELY(mmap ? 
(end_lsn - get_flushed_lsn(std::memory_order_relaxed)) > capacity() : b + size >= buf_size)) @@ -1057,7 +1058,7 @@ std::pair log_t::append_prepare(size_t size, bool ex) noexcept } size_t new_buf_free= b + size; - if (pmem && new_buf_free >= file_size) + if (mmap && new_buf_free >= file_size) new_buf_free-= size_t(capacity()); lsn.store(end_lsn, std::memory_order_relaxed); @@ -1213,10 +1214,10 @@ inline void log_t::resize_write(lsn_t lsn, const byte *end, size_t len, end-= len; size_t s; -#ifdef HAVE_PMEM +#ifdef HAVE_INNODB_MMAP if (!resize_flush_buf) { - ut_ad(is_pmem()); + ut_ad(is_mmap()); lsn_lock.wr_lock(); const size_t resize_capacity{resize_target - START_OFFSET}; { @@ -1236,7 +1237,7 @@ inline void log_t::resize_write(lsn_t lsn, const byte *end, size_t len, cannot overlap, that is, our entire log must be discarded. Besides, incomplete mini-transactions cannot be parsed anyway. */ ut_ad(resizing >= lsn + len); - goto pmem_done; + goto mmap_done; } s= START_OFFSET; @@ -1277,7 +1278,7 @@ inline void log_t::resize_write(lsn_t lsn, const byte *end, size_t len, we will advance resize_lsn. */ ut_ad(resize_buf[s] <= 1); resize_buf[s]= 1; - pmem_done: + mmap_done: lsn_lock.wr_unlock(); } else @@ -1300,12 +1301,12 @@ inline void log_t::append(byte *&d, const void *s, size_t size) noexcept { ut_ad(log_sys.latch_have_any()); ut_ad(d + size <= log_sys.buf + - (log_sys.is_pmem() ? log_sys.file_size : log_sys.buf_size)); + (log_sys.is_mmap() ? log_sys.file_size : log_sys.buf_size)); memcpy(d, s, size); d+= size; } -template +template std::pair mtr_t::finish_writer(mtr_t *mtr, size_t len) { @@ -1316,16 +1317,14 @@ mtr_t::finish_writer(mtr_t *mtr, size_t len) const size_t size{mtr->m_commit_lsn ? 
5U + 8U : 5U}; std::pair start= - log_sys.append_prepare(len, mtr->m_latch_ex); + log_sys.append_prepare(len, mtr->m_latch_ex); - if (!pmem) + if (!mmap) { mtr->m_log.for_each_block([&start](const mtr_buf_t::block_t *b) { log_sys.append(start.second, b->begin(), b->used()); return true; }); -#ifdef HAVE_PMEM write_trailer: -#endif *start.second++= log_sys.get_sequence_bit(start.first + len - size); if (mtr->m_commit_lsn) { @@ -1336,7 +1335,6 @@ mtr_t::finish_writer(mtr_t *mtr, size_t len) mach_write_to_4(start.second, mtr->m_crc); start.second+= 4; } -#ifdef HAVE_PMEM else { if (UNIV_LIKELY(start.second + len <= &log_sys.buf[log_sys.file_size])) @@ -1384,9 +1382,6 @@ mtr_t::finish_writer(mtr_t *mtr, size_t len) ((size >= size_left) ? log_sys.START_OFFSET : log_sys.file_size) + (size - size_left); } -#else - static_assert(!pmem, ""); -#endif log_sys.resize_write(start.first, start.second, len, size); diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc index 124ac4b294d..56b31c12063 100644 --- a/storage/innobase/srv/srv0start.cc +++ b/storage/innobase/srv/srv0start.cc @@ -1094,7 +1094,8 @@ same_size: log_sys.latch.wr_unlock(); - log_write_up_to(flushed_lsn, false); + if (latest_format) + log_write_up_to(flushed_lsn, false); ut_ad(flushed_lsn == log_sys.get_lsn()); ut_ad(!os_aio_pending_reads()); @@ -1290,10 +1291,7 @@ dberr_t srv_start(bool create_new_db) } #endif /* UNIV_DEBUG */ - if (!log_sys.create()) { - return srv_init_abort(DB_ERROR); - } - + log_sys.create(); recv_sys.create(); lock_sys.create(srv_lock_table_size); @@ -1856,13 +1854,13 @@ skip_monitors: if (srv_print_verbose_log) { sql_print_information("InnoDB: " "log sequence number " LSN_PF -#ifdef HAVE_PMEM +#ifdef HAVE_INNODB_MMAP "%s" #endif "; transaction id " TRX_ID_FMT, recv_sys.lsn, -#ifdef HAVE_PMEM - log_sys.is_pmem() +#ifdef HAVE_INNODB_MMAP + log_sys.is_mmap() ? 
" (memory-mapped)" : "", #endif trx_sys.get_max_trx_id()); diff --git a/storage/innobase/trx/trx0trx.cc b/storage/innobase/trx/trx0trx.cc index 0a4845f7763..2b304868b14 100644 --- a/storage/innobase/trx/trx0trx.cc +++ b/storage/innobase/trx/trx0trx.cc @@ -1260,20 +1260,20 @@ static void trx_flush_log_if_needed(lsn_t lsn, trx_t *trx) const bool flush= (srv_file_flush_method != SRV_NOSYNC && (srv_flush_log_at_trx_commit & 1)); + if (!log_sys.is_mmap()) + { + completion_callback cb; - completion_callback cb; - if (!log_sys.is_pmem() && - (cb.m_param= thd_increment_pending_ops(trx->mysql_thd))) - { - cb.m_callback = (void (*)(void *)) thd_decrement_pending_ops; - log_write_up_to(lsn, flush, &cb); - } - else - { - trx->op_info= "flushing log"; - log_write_up_to(lsn, flush); - trx->op_info= ""; + if ((cb.m_param= thd_increment_pending_ops(trx->mysql_thd))) + { + cb.m_callback= (void (*)(void *)) thd_decrement_pending_ops; + log_write_up_to(lsn, flush, &cb); + return; + } } + trx->op_info= "flushing log"; + log_write_up_to(lsn, flush); + trx->op_info= ""; } /** Process tables that were modified by the committing transaction. */