MDEV-37244: Avoid page lookup after read

buf_read_page(): Return a pointer to a buffer-fixed, non-read-fixed page,
or nullptr in case of an error.

buf_inc_get(): Wrapper for buf_inc_get(ha_handler_stats*),
to read the thread-local variable mariadb_stats before updating it.

IORequest::read_complete(): Assert that the page is both read-fixed
and buffer-fixed. Sample recv_sys.recovery_on only once.
Buffer-unfix the page when the asynchronous read completes.

buf_page_t::read_complete(): Assert that the page is both
read-fixed and buffer-fixed.

buf_page_t::read_wait(): Wait for a read-fixed and buffer-fixed page
to be only read-fixed. Return the state and optionally page identifier
when holding a shared latch.

buf_page_init_for_read(): Return a pointer to a buffer-fixed block
descriptor, bitwise-ORed with 1 in case the block already
exists in the buffer pool.

buf_read_ahead_update(), buf_read_ahead_update_sql(): Common code
for updating some statistics counters.

buf_read_page_low(): Replace the parameter sync with err, which will
return an error code to a synchronous caller. Add a parameter for
thread-local mariadb_stats.
Return the pointer to the block, or the special values nullptr
(read failure) or -1 or -2 for asynchronous reads.
Increment the statistics when a synchronous read was requested.
In a synchronous read, if the page has already been allocated in
the buffer pool but it is read-fixed, wait for the read to complete.

buf_page_get_zip(): Get a buffer-fixed page from buf_read_page(),
and unfix() it. Our caller is relying solely on a page latch.

buf_read_page_background(): Update the statistics if supplied.
This commit is contained in:
Marko Mäkelä
2025-07-18 14:43:30 +03:00
parent f760918b40
commit a027160cec
8 changed files with 378 additions and 297 deletions

View File

@ -65,6 +65,7 @@ Created 10/16/1994 Heikki Tuuri
#include "mysql/service_wsrep.h"
#endif /* WITH_WSREP */
#include "log.h"
#include "mariadb_stats.h"
/** Modification types for the B-tree operation.
Note that the order must be DELETE, BOTH, INSERT !!
@ -2257,7 +2258,7 @@ Prefetch siblings of the leaf for the pessimistic operation.
@param block leaf page
@param index index of the page */
static void btr_cur_prefetch_siblings(const buf_block_t *block,
const dict_index_t *index)
const dict_index_t *index) noexcept
{
ut_ad(page_is_leaf(block->page.frame));
@ -2266,15 +2267,23 @@ static void btr_cur_prefetch_siblings(const buf_block_t *block,
uint32_t next= mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_NEXT));
fil_space_t *space= index->table->space;
page_id_t id{space->id, 0};
if (prev == FIL_NULL);
else if (space->acquire())
buf_read_page_background(space, page_id_t(space->id, prev),
block->zip_size());
if (next == FIL_NULL);
else if (space->acquire())
buf_read_page_background(space, page_id_t(space->id, next),
block->zip_size());
ha_handler_stats *stats= mariadb_stats;
if (prev != FIL_NULL)
{
id.set_page_no(prev);
if (space->acquire())
buf_read_page_background(id, space, stats);
}
if (next != FIL_NULL)
{
id.set_page_no(next);
if (space->acquire())
buf_read_page_background(id, space, stats);
}
}
/*************************************************************//**

View File

@ -2196,17 +2196,22 @@ void buf_page_free(fil_space_t *space, uint32_t page, mtr_t *mtr)
mtr->memo_push(block, MTR_MEMO_PAGE_X_MODIFY);
}
static void buf_inc_get(ha_handler_stats *stats)
static void buf_inc_get(ha_handler_stats *stats) noexcept
{
mariadb_increment_pages_accessed(stats);
if (stats)
mariadb_increment_pages_accessed(stats);
++buf_pool.stat.n_page_gets;
}
static void buf_inc_get() noexcept
{
buf_inc_get(mariadb_stats);
}
TRANSACTIONAL_TARGET
buf_page_t *buf_page_get_zip(const page_id_t page_id) noexcept
{
ha_handler_stats *const stats= mariadb_stats;
buf_inc_get(stats);
buf_inc_get();
buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold());
page_hash_latch &hash_lock= buf_pool.page_hash.lock_get(chain);
@ -2235,29 +2240,33 @@ buf_page_t *buf_page_get_zip(const page_id_t page_id) noexcept
if (!bpage)
{
hash_lock.unlock_shared();
switch (dberr_t err= buf_read_page(page_id, chain, false)) {
case DB_SUCCESS:
case DB_SUCCESS_LOCKED_REC:
mariadb_increment_pages_read(stats);
continue;
case DB_TABLESPACE_DELETED:
return nullptr;
default:
sql_print_error("InnoDB: Reading compressed page "
"[page id: space=" UINT32PF ", page number=" UINT32PF
"] failed with error: %s",
page_id.space(), page_id.page_no(), ut_strerr(err));
dberr_t err;
bpage= &buf_read_page(page_id, &err, chain, false)->page;
if (!bpage)
{
if (err != DB_TABLESPACE_DELETED)
sql_print_error("InnoDB: Reading compressed page "
"[page id: space=" UINT32PF ", page number=" UINT32PF
"] failed with error: %s",
page_id.space(), page_id.page_no(), ut_strerr(err));
return nullptr;
}
hash_lock.lock_shared();
bpage->unfix();
}
ut_ad(bpage->in_file());
ut_d(uint32_t state= bpage->state());
ut_ad(state >= buf_page_t::UNFIXED);
ut_ad(page_id == bpage->id());
const bool got_s_latch= bpage->lock.s_lock_try();
hash_lock.unlock_shared();
if (UNIV_LIKELY(got_s_latch))
{
ut_ad(!bpage->is_read_fixed());
break;
}
/* We may fail to acquire bpage->lock because a read is holding an
exclusive latch on this block and either in progress or invoking
buf_pool_t::corrupted_evict().
@ -2481,6 +2490,34 @@ buf_block_t *buf_pool_t::unzip(buf_page_t *b, buf_pool_t::hash_chain &chain)
return block;
}
uint32_t buf_page_t::read_wait(page_id_t *page_id, ha_handler_stats *stats)
noexcept
{
ulonglong start= 0;
if (stats)
{
tpool::tpool_wait_begin();
thd_wait_begin(nullptr, THD_WAIT_DISKIO);
stats->pages_read_count++;
if (stats->active)
start= mariadb_measure();
}
lock.s_lock();
uint32_t latched_state= state();
ut_ad(latched_state > FREED);
ut_ad(latched_state < READ_FIX || latched_state > WRITE_FIX);
if (page_id)
*page_id= id();
lock.s_unlock();
if (start)
{
stats->pages_read_time+= mariadb_measure() - start;
tpool::tpool_wait_end();
thd_wait_end(nullptr);
}
return latched_state;
}
buf_block_t *buf_pool_t::page_fix(const page_id_t id,
dberr_t *err,
buf_pool_t::page_fix_conflicts c) noexcept
@ -2493,9 +2530,10 @@ buf_block_t *buf_pool_t::page_fix(const page_id_t id,
{
hash_lock.lock_shared();
buf_page_t *b= page_hash.get(id, chain);
uint32_t state;
if (b)
{
uint32_t state= b->fix() + 1;
state= b->fix() + 1;
hash_lock.unlock_shared();
if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED))
@ -2534,11 +2572,8 @@ buf_block_t *buf_pool_t::page_fix(const page_id_t id,
std::this_thread::sleep_for(std::chrono::microseconds(100));
continue;
}
b->lock.s_lock();
state= b->state();
ut_ad(state < buf_page_t::READ_FIX || state >= buf_page_t::WRITE_FIX);
b->lock.s_unlock();
state= b->read_wait(nullptr, stats);
}
if (UNIV_UNLIKELY(!b->frame))
@ -2566,17 +2601,21 @@ buf_block_t *buf_pool_t::page_fix(const page_id_t id,
if (c == FIX_NOWAIT)
return reinterpret_cast<buf_block_t*>(-1);
switch (dberr_t local_err= buf_read_page(id, chain)) {
default:
if (err)
*err= local_err;
buf_block_t *block= buf_read_page(id, err, chain);
if (!block)
return nullptr;
case DB_SUCCESS:
case DB_SUCCESS_LOCKED_REC:
mariadb_increment_pages_read(stats);
buf_read_ahead_random(id);
buf_read_ahead_random(id);
if (err)
{
ut_ad(*err == DB_SUCCESS || *err == DB_SUCCESS_LOCKED_REC);
*err= DB_SUCCESS;
}
if (UNIV_UNLIKELY(!block->page.frame))
{
b= &block->page;
goto unzip;
}
return block;
}
}
@ -2687,7 +2726,7 @@ buf_page_get_gen(
}
#endif /* UNIV_DEBUG */
ha_handler_stats* const stats = mariadb_stats;
ha_handler_stats *const stats = mariadb_stats;
buf_inc_get(stats);
auto& chain= buf_pool.page_hash.cell_get(page_id.fold());
page_hash_latch& hash_lock = buf_pool.page_hash.lock_get(chain);
@ -2717,47 +2756,25 @@ loop:
switch (mode) {
case BUF_GET_IF_IN_POOL:
case BUF_PEEK_IF_IN_POOL:
return nullptr;
}
/* The call path is buf_read_page() ->
buf_read_page_low() (fil_space_t::io()) ->
buf_page_t::read_complete() ->
buf_decrypt_after_read(). Here fil_space_t* is used
and we decrypt -> buf_page_check_corrupt() where page
checksums are compared. Decryption, decompression as
well as error handling takes place at a lower level.
Here we only need to know whether the page really is
corrupted, or if an encrypted page with a valid
checksum cannot be decrypted. */
switch (dberr_t local_err = buf_read_page(page_id, chain)) {
case DB_SUCCESS:
case DB_SUCCESS_LOCKED_REC:
mariadb_increment_pages_read(stats);
buf_read_ahead_random(page_id);
break;
default:
if (mode != BUF_GET_POSSIBLY_FREED
&& retries++ < BUF_PAGE_READ_MAX_RETRIES) {
DBUG_EXECUTE_IF("intermittent_read_failure",
retries = BUF_PAGE_READ_MAX_RETRIES;);
block = buf_read_page(page_id, err, chain);
if (!block) {
break;
} else if (err) {
*err = DB_SUCCESS;
}
/* fall through */
case DB_PAGE_CORRUPTED:
if (err) {
*err = local_err;
}
return nullptr;
ut_d(if (!(++buf_dbg_counter % 5771)) buf_pool.validate());
buf_read_ahead_random(page_id);
state = block->page.state();
goto not_read_fixed;
}
ut_d(if (!(++buf_dbg_counter % 5771)) buf_pool.validate());
goto loop;
return nullptr;
got_block:
state++;
ut_ad(state > buf_page_t::FREED);
if (state > buf_page_t::READ_FIX && state < buf_page_t::WRITE_FIX) {
if (mode == BUF_PEEK_IF_IN_POOL) {
ignore_block:
@ -2778,12 +2795,8 @@ ignore_unfixed:
in buf_page_t::read_complete() or
buf_pool_t::corrupted_evict(), or
after buf_zip_decompress() in this function. */
block->page.lock.s_lock();
state = block->page.state();
ut_ad(state < buf_page_t::READ_FIX
|| state >= buf_page_t::WRITE_FIX);
const page_id_t id{block->page.id()};
block->page.lock.s_unlock();
page_id_t id{0};
state = block->page.read_wait(&id, stats);
if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED)) {
block->page.unfix();
@ -2807,12 +2820,19 @@ ignore_unfixed:
return nullptr;
}
ut_ad(id == page_id);
} else if (mode != BUF_PEEK_IF_IN_POOL) {
} else if (UNIV_UNLIKELY(!block->page.frame)) {
/* The BUF_PEEK_IF_IN_POOL mode is mainly used for dropping an
adaptive hash index. There cannot be an
adaptive hash index for a compressed-only page. */
goto ignore_block;
} else {
not_read_fixed:
ut_ad(state > buf_page_t::FREED);
ut_ad(state < buf_page_t::READ_FIX
|| state > buf_page_t::WRITE_FIX);
if (mode != BUF_PEEK_IF_IN_POOL) {
} else if (UNIV_UNLIKELY(!block->page.frame)) {
/* The BUF_PEEK_IF_IN_POOL mode is mainly used
for dropping an adaptive hash index. There
cannot be an adaptive hash index for a
compressed-only page. */
goto ignore_block;
}
}
ut_ad(mode == BUF_GET_IF_IN_POOL || mode == BUF_PEEK_IF_IN_POOL
@ -3008,7 +3028,7 @@ buf_block_t *buf_page_try_get(const page_id_t page_id, mtr_t *mtr) noexcept
ut_ad(block->page.buf_fix_count());
ut_ad(block->page.id() == page_id);
buf_inc_get(mariadb_stats);
buf_inc_get();
return block;
}
@ -3411,17 +3431,26 @@ static dberr_t buf_page_check_corrupt(buf_page_t *bpage,
/** Complete a read of a page.
@param node data file
@param recovery recv_recovery_is_on()
@return whether the operation succeeded
@retval DB_SUCCESS if the read succeeded; caller must unfix()
@retval DB_PAGE_CORRUPTED if the checksum or the page ID is incorrect
@retval DB_DECRYPTION_FAILED if the page cannot be decrypted */
dberr_t buf_page_t::read_complete(const fil_node_t &node) noexcept
dberr_t buf_page_t::read_complete(const fil_node_t &node,
bool recovery) noexcept
{
const page_id_t expected_id{id()};
ut_ad(is_read_fixed());
{
/* The block must be read-fixed and buffer-fixed. */
ut_d(const auto s= state());
ut_ad(s > READ_FIX);
ut_ad(s < WRITE_FIX);
}
ut_ad(!buf_dblwr.is_inside(id()));
ut_ad(id().space() == node.space->id);
ut_ad(zip_size() == node.space->zip_size());
ut_ad(!!zip.ssize == !!zip.data);
ut_ad(recovery == recv_sys.recovery_on);
const byte *read_frame= zip.data ? zip.data : frame;
ut_ad(read_frame);
@ -3500,8 +3529,7 @@ database_corrupted_compressed:
if (err == DB_PAGE_CORRUPTED || err == DB_DECRYPTION_FAILED)
{
release_page:
if (node.space->full_crc32() && node.space->crypt_data &&
recv_recovery_is_on() &&
if (recovery && node.space->full_crc32() && node.space->crypt_data &&
recv_sys.dblwr.find_deferred_page(node, id().page_no(),
const_cast<byte*>(read_frame)))
{
@ -3510,7 +3538,7 @@ release_page:
goto success_page;
}
if (recv_sys.free_corrupted_page(expected_id, node));
if (recovery && recv_sys.free_corrupted_page(expected_id, node));
else if (err == DB_FAIL)
err= DB_PAGE_CORRUPTED;
else
@ -3528,28 +3556,25 @@ release_page:
FORCE_RECOVERY_MSG);
}
buf_pool.corrupted_evict(this, buf_page_t::READ_FIX);
buf_pool.corrupted_evict(this, buf_page_t::READ_FIX + 1);
return err;
}
}
success_page:
const bool recovery= frame && recv_recovery_is_on();
if (recovery && !recv_recover_page(node.space, this))
if (!recovery || !frame)
{
ut_d(auto f=) zip.fix.fetch_sub(READ_FIX - UNFIXED);
ut_ad(f > READ_FIX);
ut_ad(f < WRITE_FIX);
}
else if (!recv_recover_page(node.space, this))
return DB_PAGE_CORRUPTED;
if (UNIV_UNLIKELY(MONITOR_IS_ON(MONITOR_MODULE_BUF_PAGE)))
buf_page_monitor(*this, true);
DBUG_PRINT("ib_buf", ("read page %u:%u", id().space(), id().page_no()));
if (!recovery)
{
ut_d(auto f=) zip.fix.fetch_sub(READ_FIX - UNFIXED);
ut_ad(f >= READ_FIX);
ut_ad(f < WRITE_FIX);
}
lock.x_unlock(true);
return DB_SUCCESS;

View File

@ -585,7 +585,6 @@ buf_load()
so all pages from a given tablespace are consecutive. */
uint32_t cur_space_id = dump[0].space();
fil_space_t* space = fil_space_t::get(cur_space_id);
ulint zip_size = space ? space->zip_size() : 0;
PSI_stage_progress* pfs_stage_progress __attribute__((unused))
= mysql_set_stage(srv_stage_buffer_pool_load.m_key);
@ -608,12 +607,6 @@ buf_load()
cur_space_id = this_space_id;
space = fil_space_t::get(cur_space_id);
if (!space) {
continue;
}
zip_size = space->zip_size();
}
/* JAN: TODO: As we use background page read below,
@ -632,7 +625,7 @@ buf_load()
}
space->reacquire();
buf_read_page_background(space, dump[i], zip_size);
buf_read_page_background(dump[i], space, nullptr);
if (buf_load_abort_flag) {
if (space) {

View File

@ -71,18 +71,20 @@ and the lock released later.
@param chain buf_pool.page_hash cell for page_id
@param block preallocated buffer block (set to nullptr if consumed)
@return pointer to the block
@retval nullptr in case of an error */
@retval nullptr in case of an error
@retval pointer to block | 1 if the page already exists in buf pool */
TRANSACTIONAL_TARGET
static buf_page_t *buf_page_init_for_read(const page_id_t page_id,
ulint zip_size,
buf_pool_t::hash_chain &chain,
buf_block_t *&block)
buf_block_t *&block) noexcept
{
buf_page_t *bpage= nullptr;
constexpr uint32_t READ_BUF_FIX{buf_page_t::READ_FIX + 1};
if (!zip_size || (zip_size & 1))
{
bpage= &block->page;
block->initialise(page_id, zip_size & ~1, buf_page_t::READ_FIX);
block->initialise(page_id, zip_size & ~1, READ_BUF_FIX);
/* x_unlock() will be invoked
in buf_page_t::read_complete() by the io-handler thread. */
block->page.lock.x_lock(true);
@ -90,11 +92,14 @@ static buf_page_t *buf_page_init_for_read(const page_id_t page_id,
page_hash_latch &hash_lock= buf_pool.page_hash.lock_get(chain);
hash_lock.lock();
if (buf_pool.page_hash.get(page_id, chain))
buf_page_t *hash_page= buf_pool.page_hash.get(page_id, chain);
if (hash_page)
{
page_exists:
hash_lock.unlock();
page_exists:
/* The page is already in the buffer pool. */
ut_d(const uint32_t state=) hash_page->fix();
ut_ad(state >= buf_page_t::FREED);
hash_lock.unlock();
if (bpage)
{
bpage->lock.x_unlock(true);
@ -102,7 +107,7 @@ page_exists:
ut_d(bpage->set_state(buf_page_t::MEMORY));
ut_d(mysql_mutex_unlock(&buf_pool.mutex));
}
return nullptr;
return reinterpret_cast<buf_page_t*>(uintptr_t(hash_page) | 1);
}
if (UNIV_UNLIKELY(mysql_mutex_trylock(&buf_pool.mutex)))
@ -110,7 +115,8 @@ page_exists:
hash_lock.unlock();
mysql_mutex_lock(&buf_pool.mutex);
hash_lock.lock();
if (buf_pool.page_hash.get(page_id, chain))
hash_page= buf_pool.page_hash.get(page_id, chain);
if (hash_page)
{
mysql_mutex_unlock(&buf_pool.mutex);
goto page_exists;
@ -160,11 +166,15 @@ page_exists:
check the page_hash again, as it may have been modified. */
if (UNIV_UNLIKELY(lru))
{
if (UNIV_LIKELY_NULL(buf_pool.page_hash.get(page_id, chain)))
hash_page= buf_pool.page_hash.get(page_id, chain);
if (UNIV_LIKELY_NULL(hash_page))
{
/* The block was added by some other thread. */
ut_d(const uint32_t state=) hash_page->fix();
ut_ad(state >= buf_page_t::FREED);
buf_buddy_free(data, zip_size);
goto func_exit;
mysql_mutex_unlock(&buf_pool.mutex);
return reinterpret_cast<buf_page_t*>(uintptr_t(hash_page) | 1);
}
}
@ -175,7 +185,7 @@ page_exists:
bpage->zip.data = (page_zip_t*) data;
bpage->lock.init();
bpage->init(buf_page_t::READ_FIX, page_id);
bpage->init(READ_BUF_FIX, page_id);
bpage->lock.x_lock(true);
{
@ -190,10 +200,8 @@ page_exists:
}
buf_pool.stat.n_pages_read++;
func_exit:
mysql_mutex_unlock(&buf_pool.mutex);
ut_ad(!bpage || bpage->in_file());
mysql_mutex_unlock(&buf_pool.mutex);
return bpage;
}
@ -205,76 +213,148 @@ flag is cleared and the x-lock released by an i/o-handler thread.
@param[in] page_id page id
@param[in] zip_size 0 or ROW_FORMAT=COMPRESSED page size
bitwise-ORed with 1 to allocate an uncompressed frame
@param[out] err nullptr for asynchronous; error code for synchronous:
DB_SUCCESS if the page was successfully read,
DB_SUCCESS_LOCKED_REC if the page exists in the pool,
DB_PAGE_CORRUPTED on page checksum mismatch,
DB_DECRYPTION_FAILED if page post encryption checksum
matches but after decryption normal page checksum
does not match
@param[in,out] chain buf_pool.page_hash cell for page_id
@param[in,out] space tablespace
@param[in,out] block preallocated buffer block
@param[in] sync true if synchronous aio is desired
@return error code
@retval DB_SUCCESS if the page was read
@retval DB_SUCCESS_LOCKED_REC if the page exists in the buffer pool already */
@param[in,out] stats per-thread statistics
@return buffer-fixed block (*err may be set to DB_SUCCESS_LOCKED_REC)
@retval -1 if err==nullptr and an asynchronous read was submitted
@retval -2 if err==nullptr and the page exists in the buffer pool
@retval nullptr if the page was not successfully read (*err will be set) */
static
dberr_t
buf_page_t*
buf_read_page_low(
const page_id_t page_id,
ulint zip_size,
dberr_t* err,
buf_pool_t::hash_chain& chain,
fil_space_t* space,
buf_block_t*& block,
bool sync = false) noexcept
ha_handler_stats* stats) noexcept
{
buf_page_t* bpage;
if (buf_dblwr.is_inside(page_id))
{
fail:
space->release();
if (err)
*err= DB_PAGE_CORRUPTED;
return nullptr;
}
if (buf_dblwr.is_inside(page_id)) {
space->release();
return DB_PAGE_CORRUPTED;
}
buf_page_t *bpage= buf_page_init_for_read(page_id, zip_size, chain, block);
if (UNIV_UNLIKELY(!bpage))
goto fail;
const bool exist(uintptr_t(bpage) & 1);
bpage= reinterpret_cast<buf_page_t*>(uintptr_t(bpage) & ~uintptr_t{1});
ulonglong start= 0;
if (exist)
{
if (!err)
{
bpage->unfix();
bpage= reinterpret_cast<buf_page_t*>(-2);
}
else
{
const uint32_t state{bpage->state()};
IF_DBUG(page_id_t id{bpage->id()},);
ut_ad(state > buf_page_t::FREED);
if (state < buf_page_t::UNFIXED)
{
corrupted:
DBUG_ASSERT(id == page_id || id == page_id_t{~0ULL});
bpage->unfix();
bpage= nullptr;
*err= DB_PAGE_CORRUPTED;
}
else if (!bpage->is_read_fixed(state))
*err= DB_SUCCESS_LOCKED_REC;
else if (bpage->read_wait(IF_DBUG(&id,nullptr), stats) <
buf_page_t::UNFIXED)
goto corrupted;
}
bpage = buf_page_init_for_read(page_id, zip_size, chain, block);
space->release();
return bpage;
}
if (!bpage) {
space->release();
return DB_SUCCESS_LOCKED_REC;
}
ut_ad(bpage->in_file());
if (err)
{
thd_wait_begin(nullptr, THD_WAIT_DISKIO);
if (stats != nullptr && stats->active)
start= mariadb_measure();
}
ut_ad(bpage->in_file());
ulonglong mariadb_timer = 0;
void* dst= zip_size > 1 ? bpage->zip.data : bpage->frame;
const ulint len= zip_size & ~1 ? zip_size & ~1 : srv_page_size;
if (sync) {
thd_wait_begin(nullptr, THD_WAIT_DISKIO);
if (const ha_handler_stats *stats = mariadb_stats) {
if (stats->active) {
mariadb_timer = mariadb_measure();
}
}
}
auto fio=
space->io(IORequest(err
? IORequest::READ_SYNC : IORequest::READ_ASYNC),
os_offset_t{page_id.page_no()} * len, len, dst, bpage);
DBUG_LOG("ib_buf",
"read page " << page_id << " zip_size=" << zip_size
<< (sync ? " sync" : " async"));
if (UNIV_UNLIKELY(fio.err != DB_SUCCESS))
{
if (UNIV_LIKELY(err != nullptr))
*err= fio.err;
recv_sys.free_corrupted_page(page_id, *space->chain.start);
buf_pool.corrupted_evict(bpage, buf_page_t::READ_FIX + 1);
bpage= nullptr;
}
else if (err != nullptr)
{
const bool recovery{recv_sys.recovery_on};
if ((*err= fio.err= bpage->read_complete(*fio.node, recovery)))
bpage= nullptr;
thd_wait_end(nullptr);
space->release();
if (stats)
{
stats->pages_read_count++;
if (start)
stats->pages_read_time+= mariadb_measure() - start;
}
void* dst = zip_size > 1 ? bpage->zip.data : bpage->frame;
const ulint len = zip_size & ~1 ? zip_size & ~1 : srv_page_size;
/* FIXME: Remove this, and accumulate stats->pages_read_count to
global statistics somewhere! */
buf_LRU_stat_inc_io();
mysql_mutex_assert_not_owner(&buf_pool.mutex);
}
else
bpage= reinterpret_cast<buf_page_t*>(-1);
auto fio = space->io(IORequest(sync
? IORequest::READ_SYNC
: IORequest::READ_ASYNC),
os_offset_t{page_id.page_no()} * len, len,
dst, bpage);
return bpage;
}
if (UNIV_UNLIKELY(fio.err != DB_SUCCESS)) {
recv_sys.free_corrupted_page(page_id, *space->chain.start);
buf_pool.corrupted_evict(bpage, buf_page_t::READ_FIX);
} else if (sync) {
thd_wait_end(nullptr);
/* The i/o was already completed in space->io() */
fio.err = bpage->read_complete(*fio.node);
space->release();
if (mariadb_timer) {
mariadb_increment_pages_read_time(mariadb_timer);
}
}
/** Update the read-ahead statistics.
@param count number of pages that were read ahead */
static void buf_read_ahead_update(size_t count) noexcept
{
mysql_mutex_lock(&buf_pool.mutex);
/* Read ahead is considered one I/O operation for the purpose of
LRU policy decision. */
buf_LRU_stat_inc_io();
buf_pool.stat.n_ra_pages_read+= count;
mysql_mutex_unlock(&buf_pool.mutex);
}
return fio.err;
/** Update the statistics for a read-ahead that is triggered from SQL.
@param count number of pages that were read ahead
@param stats per-thread statistics */
static void buf_read_ahead_update_sql(size_t count, ha_handler_stats *stats)
noexcept
{
if (stats)
stats->pages_prefetched+= count;
buf_read_ahead_update(count);
}
/** Acquire a buffer block. */
@ -284,7 +364,7 @@ static buf_block_t *buf_read_acquire()
}
/** Free a buffer block if needed. */
static void buf_read_release(buf_block_t *block)
static buf_block_t *buf_read_release(buf_block_t *block) noexcept
{
if (block)
{
@ -292,6 +372,8 @@ static void buf_read_release(buf_block_t *block)
buf_LRU_block_free_non_file_page(block);
mysql_mutex_unlock(&buf_pool.mutex);
}
return block;
}
/** Applies a random read-ahead in buf_pool if there are at least a threshold
@ -364,13 +446,15 @@ read_ahead:
goto allocate_block;
}
/* Read all the suitable blocks within the area */
for (page_id_t i= low; i < high; ++i)
{
if (space->is_stopping())
break;
buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(i.fold());
space->reacquire();
if (buf_read_page_low(i, zip_size, chain, space, block) == DB_SUCCESS)
if (reinterpret_cast<buf_page_t*>(-1) ==
buf_read_page_low(i, zip_size, nullptr, chain, space, block, nullptr))
{
count++;
ut_ad(!block);
@ -382,16 +466,10 @@ read_ahead:
if (count)
{
mariadb_increment_pages_prefetched(count);
DBUG_PRINT("ib_buf", ("random read-ahead %zu pages from %s: %u",
count, space->chain.start->name,
low.page_no()));
mysql_mutex_lock(&buf_pool.mutex);
/* Read ahead is considered one I/O operation for the purpose of
LRU policy decision. */
buf_LRU_stat_inc_io();
buf_pool.stat.n_ra_pages_read_rnd+= count;
mysql_mutex_unlock(&buf_pool.mutex);
buf_read_ahead_update_sql(count, mariadb_stats);
}
space->release();
@ -399,10 +477,13 @@ read_ahead:
return count;
}
dberr_t buf_read_page(const page_id_t page_id,
buf_pool_t::hash_chain &chain, bool unzip) noexcept
buf_block_t *buf_read_page(const page_id_t page_id, dberr_t *err,
buf_pool_t::hash_chain &chain, bool unzip) noexcept
{
fil_space_t *space= fil_space_t::get(page_id.space());
dberr_t local_err;
if (!err)
err= &local_err;
if (UNIV_UNLIKELY(!space))
{
sql_print_information("InnoDB: trying to read page "
@ -410,7 +491,8 @@ dberr_t buf_read_page(const page_id_t page_id,
", page number=" UINT32PF "]"
" in nonexisting or being-dropped tablespace",
page_id.space(), page_id.page_no());
return DB_TABLESPACE_DELETED;
*err= DB_TABLESPACE_DELETED;
return nullptr;
}
/* Our caller should already have ensured that the page does not
@ -422,7 +504,6 @@ dberr_t buf_read_page(const page_id_t page_id,
{
allocate_block:
mysql_mutex_lock(&buf_pool.mutex);
buf_LRU_stat_inc_io();
block= buf_LRU_get_free_block(have_mutex);
mysql_mutex_unlock(&buf_pool.mutex);
}
@ -432,54 +513,41 @@ dberr_t buf_read_page(const page_id_t page_id,
goto allocate_block;
}
dberr_t err= buf_read_page_low(page_id, zip_size, chain, space, block, true);
buf_page_t *b= buf_read_page_low(page_id, zip_size, err, chain, space,
block, mariadb_stats);
buf_read_release(block);
return err;
return reinterpret_cast<buf_block_t*>(b);
}
/** High-level function which reads a page asynchronously from a file to the
buffer buf_pool if it is not already there. Sets the io_fix flag and sets
an exclusive lock on the buffer frame. The flag is cleared and the x-lock
released by the i/o-handler thread.
@param[in,out] space tablespace
@param[in] page_id page id
@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 */
void buf_read_page_background(fil_space_t *space, const page_id_t page_id,
ulint zip_size) noexcept
/** Read a page asynchronously into buf_pool if it is not already there.
@param page_id page identifier
@param space tablespace
@param stats statistics */
void buf_read_page_background(const page_id_t page_id, fil_space_t *space,
ha_handler_stats *stats) noexcept
{
ut_ad(!recv_recovery_is_on());
buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold());
if (buf_pool.page_hash_contains(page_id, chain))
{
skip:
space->release();
return;
}
buf_block_t *block= nullptr;
if (UNIV_LIKELY(!zip_size))
{
allocate_block:
if (UNIV_UNLIKELY(!(block= buf_read_acquire())))
goto skip;
}
else if (recv_recovery_is_on())
{
zip_size|= 1;
goto allocate_block;
}
if (buf_read_page_low(page_id, zip_size, chain, space, block) ==
DB_SUCCESS)
ut_ad(!block);
else
buf_read_release(block);
/* We do not increment number of I/O operations used for LRU policy
here (buf_LRU_stat_inc_io()). We use this in heuristics to decide
about evicting uncompressed version of ROW_FORMAT=COMPRESSED pages
from the buffer pool. Since this function is called from buffer pool
load these IOs are deliberate and are not part of normal workload we
can ignore these in our heuristics. */
{
buf_block_t *b= nullptr;
ulint zip_size{space->zip_size()};
if (UNIV_LIKELY(!zip_size) && UNIV_UNLIKELY(!(b= buf_read_acquire())))
goto skip;
buf_read_page_low(page_id, zip_size, nullptr, chain, space, b, nullptr);
if (!buf_read_release(b) && stats)
{
stats->pages_prefetched++;
buf_read_ahead_update(1);
}
/* buf_load() invokes this with stats=nullptr. In that case, we skip
the call to buf_read_ahead_update() or buf_LRU_stat_inc_io(); these
deliberate page reads are not part of a normal workload and therefore
should not affect the unzip_LRU heuristics. */
}
}
/** Applies linear read-ahead if in the buf_pool the page is a border page of
@ -662,8 +730,9 @@ failed:
break;
buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(new_low.fold());
space->reacquire();
if (buf_read_page_low(new_low, zip_size, chain, space, block) ==
DB_SUCCESS)
if (reinterpret_cast<buf_page_t*>(-1) ==
buf_read_page_low(new_low, zip_size, nullptr,
chain, space, block, nullptr))
{
count++;
ut_ad(!block);
@ -675,16 +744,10 @@ failed:
if (count)
{
mariadb_increment_pages_prefetched(count);
DBUG_PRINT("ib_buf", ("random read-ahead %zu pages from %s: %u",
DBUG_PRINT("ib_buf", ("linear read-ahead %zu pages from %s: %u",
count, space->chain.start->name,
new_low.page_no()));
mysql_mutex_lock(&buf_pool.mutex);
/* Read ahead is considered one I/O operation for the purpose of
LRU policy decision. */
buf_LRU_stat_inc_io();
buf_pool.stat.n_ra_pages_read+= count;
mysql_mutex_unlock(&buf_pool.mutex);
buf_read_ahead_update_sql(count, mariadb_stats);
}
space->release();
@ -708,29 +771,23 @@ void buf_read_recover(fil_space_t *space, const page_id_t page_id,
if (init_lsn)
{
if (buf_page_t *bpage=
buf_page_init_for_read(page_id, zip_size, chain, block))
{
ut_ad(bpage->in_file());
buf_page_t *bpage= buf_page_init_for_read(page_id, zip_size, chain, block);
if (UNIV_UNLIKELY(!bpage))
goto fail;
const bool exist(uintptr_t(bpage) & 1);
bpage= reinterpret_cast<buf_page_t*>(uintptr_t(bpage) & ~uintptr_t{1});
bpage->unfix();
if (!exist)
os_fake_read(IORequest{bpage, (buf_tmp_buffer_t*) &recs,
UT_LIST_GET_FIRST(space->chain),
IORequest::READ_ASYNC}, init_lsn);
return;
}
}
else if (dberr_t err=
buf_read_page_low(page_id, zip_size, chain, space, block))
{
if (err != DB_SUCCESS_LOCKED_REC)
sql_print_error("InnoDB: Recovery failed to read page "
UINT32PF " from %s",
page_id.page_no(), space->chain.start->name);
}
else
{
ut_ad(!block);
return;
}
else if (!buf_read_page_low(page_id, zip_size, nullptr, chain, space, block,
nullptr))
fail:
sql_print_error("InnoDB: Recovery failed to read page %" PRIu32 " from %s",
page_id.page_no(), space->chain.start->name);
buf_LRU_block_free_non_file_page(block);
buf_read_release(block);
}

View File

@ -2927,25 +2927,31 @@ void IORequest::read_complete(int io_error) const noexcept
ut_ad(node);
ut_ad(is_read());
ut_ad(bpage);
ut_d(auto s= bpage->state());
ut_ad(s > buf_page_t::READ_FIX);
ut_ad(s <= buf_page_t::WRITE_FIX);
const page_id_t id(bpage->id());
const bool in_recovery{recv_sys.recovery_on};
if (UNIV_UNLIKELY(io_error != 0))
{
sql_print_error("InnoDB: Read error %d of page " UINT32PF " in file %s",
io_error, id.page_no(), node->name);
recv_sys.free_corrupted_page(id, *node);
buf_pool.corrupted_evict(bpage, buf_page_t::READ_FIX);
buf_pool.corrupted_evict(bpage, buf_page_t::READ_FIX + 1);
corrupted:
if (recv_recovery_is_on() && !srv_force_recovery)
if (in_recovery && !srv_force_recovery)
{
mysql_mutex_lock(&recv_sys.mutex);
recv_sys.set_corrupt_fs();
mysql_mutex_unlock(&recv_sys.mutex);
}
}
else if (bpage->read_complete(*node))
else if (bpage->read_complete(*node, in_recovery))
goto corrupted;
else
bpage->unfix();
node->space->release();
}

View File

@ -39,6 +39,7 @@ Created 11/5/1995 Heikki Tuuri
#include "log0log.h"
#include "srv0srv.h"
#include "transactional_lock_guard.h"
#include "ha_handler_stats.h"
#include <ostream>
/** The allocation granularity of innodb_buffer_pool_size */
@ -703,10 +704,18 @@ public:
/** Complete a read of a page.
@param node data file
@param recovery recv_recovery_is_on()
@return whether the operation succeeded
@retval DB_SUCCESS if the read succeeded; caller must unfix()
@retval DB_PAGE_CORRUPTED if the checksum or the page ID is incorrect
@retval DB_DECRYPTION_FAILED if the page cannot be decrypted */
dberr_t read_complete(const fil_node_t &node) noexcept;
dberr_t read_complete(const fil_node_t &node, bool recovery) noexcept;
/** Wait for read_complete().
@param page_id id() at the time we were holding lock
@param stats per-thread statistics to update
@return state() at the time we were holding lock */
uint32_t read_wait(page_id_t *page_id, ha_handler_stats *stats) noexcept;
/** Release a write fix after a page write was completed.
@param persistent whether the page belongs to a persistent tablespace
@ -1842,7 +1851,7 @@ inline void buf_page_t::set_state(uint32_t s) noexcept
mysql_mutex_assert_owner(&buf_pool.mutex);
ut_ad(s <= REMOVE_HASH || s >= UNFIXED);
ut_ad(s < WRITE_FIX);
ut_ad(s <= READ_FIX || zip.fix == READ_FIX);
ut_ad(s <= READ_FIX + 1 || zip.fix == READ_FIX + 1);
zip.fix= s;
}

View File

@ -32,27 +32,26 @@ Created 11/5/1995 Heikki Tuuri
will be invoked on read completion.
@param page_id page identifier
@param chain buf_pool.page_hash cell for page_id
@param err error code: DB_SUCCESS if the page was successfully read,
DB_SUCCESS_LOCKED_REC if the page was not read,
DB_PAGE_CORRUPTED on page checksum mismatch,
DB_DECRYPTION_FAILED if page post encryption checksum matches but
after decryption normal page checksum does not match,
DB_TABLESPACE_DELETED if tablespace .ibd file is missing
@param unzip whether to decompress ROW_FORMAT=COMPRESSED pages
@retval DB_SUCCESS if the page was read and is not corrupted
@retval DB_SUCCESS_LOCKED_REC if the page was not read
@retval DB_PAGE_CORRUPTED if page based on checksum check is corrupted,
@retval DB_DECRYPTION_FAILED if page post encryption checksum matches but
after decryption normal page checksum does not match.
@retval DB_TABLESPACE_DELETED if tablespace .ibd file is missing */
dberr_t buf_read_page(const page_id_t page_id,
buf_pool_t::hash_chain &chain, bool unzip= true)
@return buffer-fixed block (*err may be set to DB_SUCCESS_LOCKED_REC)
@retval nullptr if the page is not available (*err will be set) */
buf_block_t *buf_read_page(const page_id_t page_id, dberr_t *err,
buf_pool_t::hash_chain &chain, bool unzip= true)
noexcept;
/** High-level function which reads a page asynchronously from a file to the
buffer buf_pool if it is not already there. Sets the io_fix flag and sets
an exclusive lock on the buffer frame. The flag is cleared and the x-lock
released by the i/o-handler thread.
@param[in,out] space tablespace
@param[in] page_id page id
@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 */
void buf_read_page_background(fil_space_t *space, const page_id_t page_id,
ulint zip_size) noexcept
MY_ATTRIBUTE((nonnull));
/** Read a page asynchronously into buf_pool if it is not already there.
@param page_id page identifier
@param space tablespace
@param stats statistics */
void buf_read_page_background(const page_id_t page_id, fil_space_t *space,
ha_handler_stats *stats) noexcept
MY_ATTRIBUTE((nonnull(2)));
/** Applies a random read-ahead in buf_pool if there are at least a threshold
value of accessed pages from the random read-ahead area. Does not read any

View File

@ -100,23 +100,6 @@ inline ulonglong mariadb_measure()
#endif
}
/*
Call this only if start_time != 0
See buf0rea.cc for an example of how to use it efficiently
*/
inline void mariadb_increment_pages_read_time(ulonglong start_time)
{
ha_handler_stats *stats= mariadb_stats;
ulonglong end_time= mariadb_measure();
/* Check that we only call this if active, see example! */
DBUG_ASSERT(start_time);
DBUG_ASSERT(stats->active);
stats->pages_read_time+= (end_time - start_time);
}
/*
Helper class to set mariadb_stats temporarly for one call in handler.cc
*/