Re: kern/56535

To: kern-bug-people%netbsd.org@localhost,gnats-admin%netbsd.org@localhost,netbsd-bugs%netbsd.org@localhost,mpratt%google.com@localhost
Subject: Re: kern/56535
From: Michael Pratt <mpratt%google.com@localhost>
Date: Tue, 14 Dec 2021 23:00:03 +0000 (UTC)

The following reply was made to PR kern/56535; it has been noted by GNATS.

From: Michael Pratt <mpratt%google.com@localhost>
To: gnats-bugs%netbsd.org@localhost
Cc: 
Subject: Re: kern/56535
Date: Tue, 14 Dec 2021 17:58:07 -0500

 We now have a reproducer for this problem in C, which I have posted at
 https://github.com/golang/go/issues/34988#issuecomment-994115345, and
 below for completeness:
 
 Notes:
 * Anecdotally, running several instances at the same time seems to
 speed up repro more than a linear speedup (system under more load?).
 That said, a single instance will eventually fail.
 * Sometimes I get several failures immediately, sometimes it takes
 >60s to get a failure.
 * The most immediately interesting part here is that I call the fork
 syscall directly rather than using `fork()` from libc. I think this
 matters simply because libc fork() is significantly slower to return
 than my_fork(), and we seem to have a small race window.
   * I've looked through _malloc_prefork/postfork and (I believe) all
 of the registered atfork callbacks, and none of them seem important,
 as neither thread is interacting with pthread or malloc.
 
 The summarized behavior we see is:
 1. `page->b = 102`
 2. `fork()`
 3. `page->b = 2`
 4. Read `page->b`, observe 102 instead of 2.
 5. When logging the corruption, we load `page->b` again, which
 typically observes 2 again.
 
 All while another thread is spinning writing to `page->c` (unrelated
 word in the same page).
 
 forkstress.c:
 
 #include <pthread.h>
 #include <stdbool.h>
 #include <stdint.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
 #include <sys/mman.h>
 #include <sys/wait.h>
 #include <unistd.h>
 
 // Implemented in fork.S: issues the fork syscall directly instead of going
 // through libc fork(). Only the parent ever returns from this call; the
 // child exits inside the assembly stub.
 uint64_t my_fork(void);
 
 /*
  * Busy-wait for `loops` iterations.
  *
  * The counter is volatile so the compiler has to perform every increment
  * and compare, and the function is kept out of line so the delay is not
  * folded into the caller.
  */
 void __attribute((noinline)) spin(uint64_t loops) {
   volatile uint64_t count = 0;
   while (count < loops) {
     count++;
   }
 }
 
 /*
  * Layout of the data placed at the start of the mmap'd page.
  *
  *   b - the word main() writes before and after fork and then reads back.
  *   c - an unrelated word in the same page, written continuously by the
  *       background thread.
  */
 struct thing {
   uint64_t b;
   // Making this (plus 'sink' below) uint64_t may make repro take longer?
   uint32_t c;
 };
 
 // The anonymous private mapping created in main(). main() writes and reads
 // page->b while the background thread writes page->c.
 volatile struct thing* page;
 
 // Value the background thread stores into page->c; it is incremented after
 // every store.
 volatile uint32_t sink;
 
 // Set to 1 (atomically) by the background thread when it starts running;
 // main() spins until it observes a non-zero value.
 int ready;
 
 /*
  * Background writer: announces that it is running, then loops forever
  * storing an ever-increasing value into page->c. Never returns.
  *
  * `arg` is unused.
  */
 void* thread(void* arg) {
   (void)arg;
 
   // Let main() know this thread is up.
   __atomic_store_n(&ready, 1, __ATOMIC_SEQ_CST);
 
   for (;;) {
     // The delay is optional, but it made the failure show up sooner in the
     // original author's testing.
     spin(40*1000);
 
     // A plain `page->c = sink;` store reproduces the problem as well; the
     // atomic store is not essential.
     uint32_t next = sink;
     __atomic_store_n(&page->c, next, __ATOMIC_SEQ_CST);
     sink++;
   }
 }
 
 /*
  * Reproducer driver.
  *
  * Maps one anonymous private page, starts the background writer thread, and
  * then loops forever doing:
  *
  *   1. page->b = 102
  *   2. fork (via the raw syscall stub my_fork())
  *   3. page->b = 2
  *   4. read page->b back and report corruption if it is not 2
  *   5. reap the child
  *
  * Returns 1 on any setup or wait failure; exits with status 1 via _exit()
  * when corruption is observed. Does not return otherwise.
  */
 int main(void) {
   page = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
               MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
   if (page == MAP_FAILED) {
     perror("mmap");
     return 1;
   }
 
   pthread_t thread_id;
   int ret = pthread_create(&thread_id, NULL, &thread, NULL);
   if (ret != 0) {
     // pthread_create() returns the error number and does not set errno, so
     // perror() would print an unrelated message here.
     fprintf(stderr, "pthread_create: %s\n", strerror(ret));
     return 1;
   }
 
   // Wait for child thread to start.
   //
   // This is not required to repro, but eliminates racing fork+thread create as
   // a possibility.
   while (!__atomic_load_n(&ready, __ATOMIC_SEQ_CST)) {
   }
 
   int64_t i = 0;
   while (1) {
     i++;
     if (i % 10000 == 0) {
       // i is 64-bit; cast so the argument matches the conversion specifier.
       printf("Loop %lld...\n", (long long)i);
     }
 
     page->b = 102;
     // Does not work with libc fork(). libc fork() is significantly slower,
     // which may be the problem.
     uint64_t pid = my_fork();
     if (pid == 0) {
       /* Child */
       _exit(0);
     }
 
     /* Parent */
     /* spin(40*1000); may speed up repro. */
     page->b = 2;
     uint64_t pb = page->b;
     if (pb != 2) {
       // page->b is deliberately loaded a second time for the message: the
       // second load typically observes the expected value again.
       printf("Corruption! pb, page->b = %llu, %llu\n",
              (unsigned long long)pb, (unsigned long long)page->b);
       // _exit() does not flush stdio buffers; without this the report is
       // lost whenever stdout is not a terminal.
       fflush(stdout);
       _exit(1);
     }
 
     int status;
     ret = waitpid((pid_t)pid, &status, 0);
     if (ret < 0) {
       perror("waitpid");
       return 1;
     }
     // WEXITSTATUS is only meaningful when the child exited normally.
     if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
       printf("Bad child status %#x\n", status);
       return 1;
     }
   }
 }
 
 fork.S:
 
 /* These are NetBSD syscall numbers! */
 #define SYS_EXIT        1
 #define SYS_FORK        2
 
 /*
  * uint64_t my_fork(void)
  *
  * Issues the fork syscall directly. If %rax is non-zero afterwards, control
  * returns to the C caller with that value as the result. If %rax is zero
  * (the child), the stub issues the exit syscall with status 0 and never
  * returns to C.
  *
  * NOTE(review): the result is only compared against zero; no separate
  * failure indication from the kernel is examined, so a failed fork is not
  * distinguished from success here -- confirm against the NetBSD syscall ABI.
  */
 .globl my_fork
 my_fork:
         /* %rax selects the syscall: fork. */
         movq $SYS_FORK, %rax
         syscall
 
         /* Non-zero result: take the parent path. */
         cmpq $0, %rax
         jne parent
 
         /* Zero result: exit(0) right here, without returning to C. */
         movq $0, %rdi
         movq $SYS_EXIT, %rax
         syscall
         /* Only reached if the exit syscall were to return. */
         hlt
 
 parent:
         ret
 
 Build and run:
 
 $ cc -pthread forkstress.c fork.S
 $ ./a.out & ./a.out & ./a.out  & ./a.out
 Loop 10000...
 Corruption! pb, page->b = 102, 2
 Loop 10000...
 Loop 10000...
 Loop 10000...
 Corruption! pb, page->b = 102, 2
 Loop 20000...
 Loop 20000...
 Corruption! pb, page->b = 102, 2

Prev by Date: Re: lib/55454 (wredrawln() in libcurses does not follow the sensible behaviour)
Next by Date: Re: kern/56535: Memory corruption in multi-threaded Go parent process following fork() on AMD CPUs
Previous by Thread: Re: lib/55454 (wredrawln() in libcurses does not follow the sensible behaviour)
Next by Thread: Re: lib/56298 (libcurses: assertion failure when $TERM lacks certain terminfo capabilities)
Indexes:

Home | Main Index | Thread Index | Old Index