Fix mdsync never-ending loop problem - Mailing list pgsql-patches
From | Heikki Linnakangas |
---|---|
Subject | Fix mdsync never-ending loop problem |
Date | |
Msg-id | 4614D38F.2060902@enterprisedb.com Whole thread Raw |
Responses |
Re: Fix mdsync never-ending loop problem
Re: Fix mdsync never-ending loop problem Re: Fix mdsync never-ending loop problem |
List | pgsql-patches |
Here's a fix for the problem that on a busy system, mdsync never finishes. See the original problem description on hackers: http://archives.postgresql.org/pgsql-hackers/2007-04/msg00259.php The solution is taken from ITAGAKI Takahiro's Load Distributed Checkpoint patch. At the beginning of mdsync, the pendingOpsTable is copied to a linked list, and that list is then processed until it's empty. -- Heikki Linnakangas EnterpriseDB http://www.enterprisedb.com Index: src/backend/storage/smgr/md.c =================================================================== RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/storage/smgr/md.c,v retrieving revision 1.127 diff -c -r1.127 md.c *** src/backend/storage/smgr/md.c 17 Jan 2007 16:25:01 -0000 1.127 --- src/backend/storage/smgr/md.c 5 Apr 2007 10:43:56 -0000 *************** *** 863,989 **** void mdsync(void) { ! bool need_retry; if (!pendingOpsTable) elog(ERROR, "cannot sync without a pendingOpsTable"); /* ! * The fsync table could contain requests to fsync relations that have ! * been deleted (unlinked) by the time we get to them. Rather than ! * just hoping an ENOENT (or EACCES on Windows) error can be ignored, ! * what we will do is retry the whole process after absorbing fsync ! * request messages again. Since mdunlink() queues a "revoke" message ! * before actually unlinking, the fsync request is guaranteed to be gone ! * the second time if it really was this case. DROP DATABASE likewise ! * has to tell us to forget fsync requests before it starts deletions. */ ! do { ! HASH_SEQ_STATUS hstat; ! PendingOperationEntry *entry; ! int absorb_counter; ! need_retry = false; /* ! * If we are in the bgwriter, the sync had better include all fsync ! * requests that were queued by backends before the checkpoint REDO ! * point was determined. We go that a little better by accepting all ! * requests queued up to the point where we start fsync'ing. */ AbsorbFsyncRequests(); ! absorb_counter = FSYNCS_PER_ABSORB; ! hash_seq_init(&hstat, pendingOpsTable); ! while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL) { ! /* ! * If fsync is off then we don't have to bother opening the file ! * at all. (We delay checking until this point so that changing ! * fsync on the fly behaves sensibly.) ! */ ! if (enableFsync) ! { ! SMgrRelation reln; ! MdfdVec *seg; ! /* ! * If in bgwriter, we want to absorb pending requests every so ! * often to prevent overflow of the fsync request queue. This ! * could result in deleting the current entry out from under ! * our hashtable scan, so the procedure is to fall out of the ! * scan and start over from the top of the function. ! */ ! if (--absorb_counter <= 0) ! { ! need_retry = true; ! break; ! } ! /* ! * Find or create an smgr hash entry for this relation. This ! * may seem a bit unclean -- md calling smgr? But it's really ! * the best solution. It ensures that the open file reference ! * isn't permanently leaked if we get an error here. (You may ! * say "but an unreferenced SMgrRelation is still a leak!" Not ! * really, because the only case in which a checkpoint is done ! * by a process that isn't about to shut down is in the ! * bgwriter, and it will periodically do smgrcloseall(). This ! * fact justifies our not closing the reln in the success path ! * either, which is a good thing since in non-bgwriter cases ! * we couldn't safely do that.) Furthermore, in many cases ! * the relation will have been dirtied through this same smgr ! * relation, and so we can save a file open/close cycle. ! */ ! reln = smgropen(entry->tag.rnode); ! ! /* ! * It is possible that the relation has been dropped or ! * truncated since the fsync request was entered. Therefore, ! * allow ENOENT, but only if we didn't fail once already on ! * this file. This applies both during _mdfd_getseg() and ! * during FileSync, since fd.c might have closed the file ! * behind our back. ! */ ! seg = _mdfd_getseg(reln, ! entry->tag.segno * ((BlockNumber) RELSEG_SIZE), ! false, EXTENSION_RETURN_NULL); ! if (seg == NULL || ! FileSync(seg->mdfd_vfd) < 0) ! { ! /* ! * XXX is there any point in allowing more than one try? ! * Don't see one at the moment, but easy to change the ! * test here if so. ! */ ! if (!FILE_POSSIBLY_DELETED(errno) || ! ++(entry->failures) > 1) ! ereport(ERROR, ! (errcode_for_file_access(), ! errmsg("could not fsync segment %u of relation %u/%u/%u: %m", ! entry->tag.segno, ! entry->tag.rnode.spcNode, ! entry->tag.rnode.dbNode, ! entry->tag.rnode.relNode))); ! else ! ereport(DEBUG1, ! (errcode_for_file_access(), ! errmsg("could not fsync segment %u of relation %u/%u/%u, but retrying: %m", ! entry->tag.segno, ! entry->tag.rnode.spcNode, ! entry->tag.rnode.dbNode, ! entry->tag.rnode.relNode))); ! need_retry = true; ! continue; /* don't delete the hashtable entry */ ! } ! } /* Okay, delete this entry */ if (hash_search(pendingOpsTable, &entry->tag, HASH_REMOVE, NULL) == NULL) elog(ERROR, "pendingOpsTable corrupted"); } ! } while (need_retry); } /* --- 863,1012 ---- void mdsync(void) { ! HASH_SEQ_STATUS hstat; ! List *syncOps; ! ListCell *cell; ! PendingOperationEntry *entry; if (!pendingOpsTable) elog(ERROR, "cannot sync without a pendingOpsTable"); /* ! * If fsync=off, mdsync is a no-op. Just clear the pendingOpsTable. */ ! if(!enableFsync) ! { ! hash_seq_init(&hstat, pendingOpsTable); ! while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL) ! { ! if (hash_search(pendingOpsTable, &entry->tag, ! HASH_REMOVE, NULL) == NULL) ! elog(ERROR, "pendingOpsTable corrupted"); ! } ! return; ! } ! ! /* ! * If we are in the bgwriter, the sync had better include all fsync ! * requests that were queued by backends before the checkpoint REDO ! * point was determined. We go that a little better by accepting all ! * requests queued up to the point where we start fsync'ing. ! */ ! AbsorbFsyncRequests(); ! ! /* Take a snapshot of pendingOpsTable into a list. We don't want to ! * scan through pendingOpsTable directly in the loop because we call ! * AbsorbFsyncRequests inside it which modifies the table. ! */ ! syncOps = NULL; ! hash_seq_init(&hstat, pendingOpsTable); ! while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL) ! { ! PendingOperationEntry *dup; ! ! /* Entries could be deleted in our scan, so copy them. */ ! dup = (PendingOperationEntry *) palloc(sizeof(PendingOperationEntry)); ! memcpy(dup, entry, sizeof(PendingOperationEntry)); ! syncOps = lappend(syncOps, dup); ! } ! ! /* Process fsync requests until the list is empty */ ! while((cell = list_head(syncOps)) != NULL) ! { ! SMgrRelation reln; ! MdfdVec *seg; ! entry = lfirst(cell); /* ! * If in bgwriter, we want to absorb pending requests every so ! * often to prevent overflow of the fsync request queue. */ AbsorbFsyncRequests(); ! /* Check that the entry is still in pendingOpsTable. It could've ! * been deleted by an absorbed relation or database deletion event. ! */ ! entry = hash_search(pendingOpsTable, &entry->tag, HASH_FIND, NULL); ! if(entry == NULL) { ! list_delete_cell(syncOps, cell, NULL); ! continue; ! } ! /* ! * Find or create an smgr hash entry for this relation. This ! * may seem a bit unclean -- md calling smgr? But it's really ! * the best solution. It ensures that the open file reference ! * isn't permanently leaked if we get an error here. (You may ! * say "but an unreferenced SMgrRelation is still a leak!" Not ! * really, because the only case in which a checkpoint is done ! * by a process that isn't about to shut down is in the ! * bgwriter, and it will periodically do smgrcloseall(). This ! * fact justifies our not closing the reln in the success path ! * either, which is a good thing since in non-bgwriter cases ! * we couldn't safely do that.) Furthermore, in many cases ! * the relation will have been dirtied through this same smgr ! * relation, and so we can save a file open/close cycle. ! */ ! reln = smgropen(entry->tag.rnode); ! /* ! * It is possible that the relation has been dropped or ! * truncated since the fsync request was entered. Therefore, ! * allow ENOENT, but only if we didn't fail once already on ! * this file. This applies both during _mdfd_getseg() and ! * during FileSync, since fd.c might have closed the file ! * behind our back. ! */ ! seg = _mdfd_getseg(reln, ! entry->tag.segno * ((BlockNumber) RELSEG_SIZE), ! false, EXTENSION_RETURN_NULL); ! if (seg == NULL || FileSync(seg->mdfd_vfd) < 0) ! { ! /* ! * The fsync table could contain requests to fsync relations that have ! * been deleted (unlinked) by the time we get to them. Rather than ! * just hoping an ENOENT (or EACCES on Windows) error can be ignored, ! * what we will do is retry after absorbing fsync ! * request messages again. Since mdunlink() queues a "revoke" message ! * before actually unlinking, the fsync request is guaranteed to be gone ! * the second time if it really was this case. DROP DATABASE likewise ! * has to tell us to forget fsync requests before it starts deletions. ! * ! * XXX is there any point in allowing more than one try? ! * Don't see one at the moment, but easy to change the ! * test here if so. ! */ ! if (!FILE_POSSIBLY_DELETED(errno) || ! ++(entry->failures) > 1) ! ereport(ERROR, ! (errcode_for_file_access(), ! errmsg("could not fsync segment %u of relation %u/%u/%u: %m", ! entry->tag.segno, ! entry->tag.rnode.spcNode, ! entry->tag.rnode.dbNode, ! entry->tag.rnode.relNode))); ! else ! ereport(DEBUG1, ! (errcode_for_file_access(), ! errmsg("could not fsync segment %u of relation %u/%u/%u, but retrying: %m", ! entry->tag.segno, ! entry->tag.rnode.spcNode, ! entry->tag.rnode.dbNode, ! entry->tag.rnode.relNode))); + } + else + { /* Okay, delete this entry */ if (hash_search(pendingOpsTable, &entry->tag, HASH_REMOVE, NULL) == NULL) elog(ERROR, "pendingOpsTable corrupted"); + + list_delete_cell(syncOps, cell, NULL); } ! } } /*
pgsql-patches by date: