Re: Track IO times in pg_stat_io - Mailing list pgsql-hackers
From | Andres Freund |
---|---|
Subject | Re: Track IO times in pg_stat_io |
Date | |
Msg-id | 20230307183929.3qbjjpu6untcypb7@awork3.anarazel.de |
In response to | Re: Track IO times in pg_stat_io (Melanie Plageman <melanieplageman@gmail.com>) |
Responses | Re: Track IO times in pg_stat_io |
List | pgsql-hackers |
Hi,

On 2023-03-06 11:30:13 -0500, Melanie Plageman wrote:
> > As pgstat_bktype_io_stats_valid() is called only in Assert(), I think that would be a good idea
> > to also check that if counts are not Zero then times are not Zero.
>
> Yes, I think adding some validation around the relationship between
> counts and timing should help prevent developers from forgetting to call
> pg_stat_count_io_op() when calling pgstat_count_io_time() (as relevant).
>
> However, I think that we cannot check that if IO counts are non-zero
> that IO times are non-zero, because the user may not have
> track_io_timing enabled. We can check that if IO times are not zero, IO
> counts are not zero. I've done this in the attached v3.

And even if track_io_timing is enabled, the timer granularity might be so low
that we *still* get zeroes.

I wonder if we should get rid of pgStatBlockReadTime, pgStatBlockWriteTime,


> @@ -1000,11 +1000,27 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
>
>     if (isExtend)
>     {
> +       instr_time  io_start,
> +                   io_time;
> +
>         /* new buffers are zero-filled */
>         MemSet((char *) bufBlock, 0, BLCKSZ);
> +
> +       if (track_io_timing)
> +           INSTR_TIME_SET_CURRENT(io_start);
> +       else
> +           INSTR_TIME_SET_ZERO(io_start);
> +

I wonder if there's an argument for tracking this in the existing IO stats as
well. But I guess we've lived with this for a long time...


> @@ -2981,16 +2998,16 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
>      * When a strategy is not in use, the write can only be a "regular" write
>      * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
>      */
> -   pgstat_count_io_op(IOOBJECT_RELATION, io_context, IOOP_WRITE);
> -
>     if (track_io_timing)
>     {
>         INSTR_TIME_SET_CURRENT(io_time);
>         INSTR_TIME_SUBTRACT(io_time, io_start);
>         pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
>         INSTR_TIME_ADD(pgBufferUsage.blk_write_time, io_time);
> +       pgstat_count_io_time(IOOBJECT_RELATION, io_context, IOOP_WRITE, io_time);
>     }

I think this needs a bit of cleanup - pgstat_count_buffer_write_time(),
pgBufferUsage.blk_write_time++, pgstat_count_io_time() is a bit excessive. We
might not be able to reduce the whole duplication at this point, but at least
it should be a bit more centralized.

> +   pgstat_count_io_op(IOOBJECT_RELATION, io_context, IOOP_WRITE);
>     pgBufferUsage.shared_blks_written++;
>
>     /*

> @@ -3594,6 +3611,9 @@ FlushRelationBuffers(Relation rel)
>
>     if (RelationUsesLocalBuffers(rel))
>     {
> +       instr_time  io_start,
> +                   io_time;
> +
>         for (i = 0; i < NLocBuffer; i++)
>         {
>             uint32      buf_state;
> @@ -3616,6 +3636,11 @@ FlushRelationBuffers(Relation rel)
>
>             PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
>
> +           if (track_io_timing)
> +               INSTR_TIME_SET_CURRENT(io_start);
> +           else
> +               INSTR_TIME_SET_ZERO(io_start);
> +
>             smgrwrite(RelationGetSmgr(rel),
>                       BufTagGetForkNum(&bufHdr->tag),
>                       bufHdr->tag.blockNum,

I don't think you need the INSTR_TIME_SET_ZERO() in the body of the loop, to
silence the compiler warnings you can do it one level up.


> @@ -228,6 +230,11 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
>
>         PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
>
> +       if (track_io_timing)
> +           INSTR_TIME_SET_CURRENT(io_start);
> +       else
> +           INSTR_TIME_SET_ZERO(io_start);
> +
>         /* And write... */
>         smgrwrite(oreln,
>                   BufTagGetForkNum(&bufHdr->tag),
> @@ -239,6 +246,13 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
>         buf_state &= ~BM_DIRTY;
>         pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
>
> +       if (track_io_timing)
> +       {
> +           INSTR_TIME_SET_CURRENT(io_time);
> +           INSTR_TIME_SUBTRACT(io_time, io_start);
> +           pgstat_count_io_time(IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL, IOOP_WRITE, io_time);
> +       }
> +
>         pgstat_count_io_op(IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL, IOOP_WRITE);
>         pgBufferUsage.local_blks_written++;
>     }

Perhaps we can instead introduce a FlushLocalBuffer()? Then we don't need this
in multiple write paths.
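To illustrate, a rough sketch of what such a helper could look like - the name
matches the suggestion above, but the signature is invented here and the
buf_state / locking details are elided:

    /*
     * Sketch only: write out one dirty local buffer, doing the checksum,
     * smgrwrite(), timing and stats accounting in a single place.
     */
    static void
    FlushLocalBuffer(BufferDesc *bufHdr, SMgrRelation reln)
    {
        instr_time  io_start,
                    io_time;
        Page        localpage = (char *) LocalBufHdrGetBlock(bufHdr);

        PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);

        if (track_io_timing)
            INSTR_TIME_SET_CURRENT(io_start);
        else
            INSTR_TIME_SET_ZERO(io_start);

        smgrwrite(reln,
                  BufTagGetForkNum(&bufHdr->tag),
                  bufHdr->tag.blockNum,
                  localpage,
                  false);

        if (track_io_timing)
        {
            INSTR_TIME_SET_CURRENT(io_time);
            INSTR_TIME_SUBTRACT(io_time, io_start);
            pgstat_count_io_time(IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL,
                                 IOOP_WRITE, io_time);
        }

        pgstat_count_io_op(IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL, IOOP_WRITE);
        pgBufferUsage.local_blks_written++;
    }

LocalBufferAlloc() and FlushRelationBuffers() could then both call this,
instead of open-coding the write-plus-accounting sequence.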
> diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
> index 352958e1fe..052875d86a 100644
> --- a/src/backend/storage/smgr/md.c
> +++ b/src/backend/storage/smgr/md.c
> @@ -1030,6 +1030,30 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
>
>     if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ ))
>     {
> +       instr_time  io_start,
> +                   io_time;
> +
> +       if (track_io_timing)
> +           INSTR_TIME_SET_CURRENT(io_start);
> +       else
> +           INSTR_TIME_SET_ZERO(io_start);
> +
> +       ereport(DEBUG1,
> +               (errmsg_internal("could not forward fsync request because request queue is full")));
> +
> +       if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0)
> +           ereport(data_sync_elevel(ERROR),
> +                   (errcode_for_file_access(),
> +                    errmsg("could not fsync file \"%s\": %m",
> +                           FilePathName(seg->mdfd_vfd))));
> +
> +       if (track_io_timing)
> +       {
> +           INSTR_TIME_SET_CURRENT(io_time);
> +           INSTR_TIME_SUBTRACT(io_time, io_start);
> +           pgstat_count_io_time(IOOBJECT_RELATION, IOCONTEXT_NORMAL, IOOP_FSYNC, io_time);
> +       }
> +
>         /*
>          * We have no way of knowing if the current IOContext is
>          * IOCONTEXT_NORMAL or IOCONTEXT_[BULKREAD, BULKWRITE, VACUUM] at this
> @@ -1042,15 +1066,6 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
>          * backend fsyncs.
>          */
>         pgstat_count_io_op(IOOBJECT_RELATION, IOCONTEXT_NORMAL, IOOP_FSYNC);
> -
> -       ereport(DEBUG1,
> -               (errmsg_internal("could not forward fsync request because request queue is full")));
> -
> -       if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0)
> -           ereport(data_sync_elevel(ERROR),
> -                   (errcode_for_file_access(),
> -                    errmsg("could not fsync file \"%s\": %m",
> -                           FilePathName(seg->mdfd_vfd))));
>     }
> }
>
> @@ -1399,6 +1414,8 @@ int
>  mdsyncfiletag(const FileTag *ftag, char *path)
>  {
>     SMgrRelation reln = smgropen(ftag->rlocator, InvalidBackendId);
> +   instr_time  io_start,
> +               io_time;
>     File        file;
>     bool        need_to_close;
>     int         result,
> @@ -1425,10 +1442,22 @@ mdsyncfiletag(const FileTag *ftag, char *path)
>         need_to_close = true;
>     }
>
> +   if (track_io_timing)
> +       INSTR_TIME_SET_CURRENT(io_start);
> +   else
> +       INSTR_TIME_SET_ZERO(io_start);
> +
>     /* Sync the file. */
>     result = FileSync(file, WAIT_EVENT_DATA_FILE_SYNC);
>     save_errno = errno;
>
> +   if (track_io_timing)
> +   {
> +       INSTR_TIME_SET_CURRENT(io_time);
> +       INSTR_TIME_SUBTRACT(io_time, io_start);
> +       pgstat_count_io_time(IOOBJECT_RELATION, IOCONTEXT_NORMAL, IOOP_FSYNC, io_time);
> +   }
> +
>     if (need_to_close)
>         FileClose(file);

Perhaps we could have mdsyncfd(), used by both mdsyncfiletag() and
register_dirty_segment()?
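Something roughly along these lines is what I'd imagine, so the fsync timing
and counting lives in exactly one spot (name and signature are invented here
for illustration; the ereport() / error-level decisions would stay with the
callers, which know whether ERROR or data_sync_elevel() applies):

    /*
     * Sketch only: sync an already-opened segment fd and account for it in
     * pg_stat_io, returning FileSync()'s result for the caller to act on.
     */
    static int
    mdsyncfd(File file)
    {
        instr_time  io_start,
                    io_time;
        int         result;
        int         save_errno;

        if (track_io_timing)
            INSTR_TIME_SET_CURRENT(io_start);
        else
            INSTR_TIME_SET_ZERO(io_start);

        result = FileSync(file, WAIT_EVENT_DATA_FILE_SYNC);
        save_errno = errno;

        if (track_io_timing)
        {
            INSTR_TIME_SET_CURRENT(io_time);
            INSTR_TIME_SUBTRACT(io_time, io_start);
            pgstat_count_io_time(IOOBJECT_RELATION, IOCONTEXT_NORMAL,
                                 IOOP_FSYNC, io_time);
        }

        pgstat_count_io_op(IOOBJECT_RELATION, IOCONTEXT_NORMAL, IOOP_FSYNC);

        errno = save_errno;
        return result;
    }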
> @@ -1359,20 +1378,31 @@ pg_stat_get_io(PG_FUNCTION_ARGS)
>
>             for (int io_op = 0; io_op < IOOP_NUM_TYPES; io_op++)
>             {
> -               int         col_idx = pgstat_get_io_op_index(io_op);
> +               int         i = pgstat_get_io_op_index(io_op);
>
>                 /*
>                  * Some combinations of BackendType and IOOp, of IOContext
>                  * and IOOp, and of IOObject and IOOp are not tracked. Set
>                  * these cells in the view NULL.
>                  */
> -               nulls[col_idx] = !pgstat_tracks_io_op(bktype, io_obj, io_context, io_op);
> +               if (pgstat_tracks_io_op(bktype, io_obj, io_context, io_op))
> +                   values[i] = Int64GetDatum(bktype_stats->counts[io_obj][io_context][io_op]);
> +               else
> +                   nulls[i] = true;
> +           }

These lines were already too long, and it's getting worse with this change.


> typedef struct PgStat_BktypeIO
> {
> -   PgStat_Counter data[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
> +   PgStat_Counter counts[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
> +   instr_time  times[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
> } PgStat_BktypeIO;

Ah, you're going to hate me. We can't store instr_time on disk. There's
another patch that gets substantial performance gains by varying the frequency
at which instr_time keeps track of time based on the CPU frequency...

It also just doesn't have enough range to keep track of system wide time on a
larger system. A single backend won't run for 293 years, but with a few
thousand backends that's a whole different story.

I think we need to accumulate in instr_time, but convert to floating point
when flushing stats.
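A minimal sketch of that shape - the PgStat_PendingIO name and the
flush_pending_io() helper are made up here, and whether the serialized unit
ends up as seconds or microseconds is an open detail - the point is just that
nothing instr_time-shaped survives past the flush:

    typedef struct PgStat_BktypeIO
    {
        PgStat_Counter counts[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
        double         times[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES]; /* seconds */
    } PgStat_BktypeIO;

    /* backend-local pending counters, never serialized */
    typedef struct PgStat_PendingIO
    {
        PgStat_Counter counts[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
        instr_time     times[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
    } PgStat_PendingIO;

    /* convert to floating point only when flushing into the shared entry */
    static void
    flush_pending_io(const PgStat_PendingIO *pending, PgStat_BktypeIO *shstats)
    {
        for (int io_object = 0; io_object < IOOBJECT_NUM_TYPES; io_object++)
            for (int io_context = 0; io_context < IOCONTEXT_NUM_TYPES; io_context++)
                for (int io_op = 0; io_op < IOOP_NUM_TYPES; io_op++)
                {
                    shstats->counts[io_object][io_context][io_op] +=
                        pending->counts[io_object][io_context][io_op];
                    shstats->times[io_object][io_context][io_op] +=
                        INSTR_TIME_GET_DOUBLE(pending->times[io_object][io_context][io_op]);
                }
    }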
Greetings,

Andres Freund