NAME

shared_rts -- runtime system for a shared communication domain

SYNOPSIS

   #include <afblib/shared_rts.h>

   bool shared_rts_run(unsigned int nofprocesses,
      size_t bufsize, size_t extra_space_size,
      const char* path, char** argv);

   struct shared_domain* shared_rts_init();
   void shared_rts_finish(struct shared_domain* sd);

DESCRIPTION

This module provides a runtime system for processes that want to communicate through a shared communication domain.

One process acts as the master: it creates the shared communication domain, starts the individual processes that are connected to the communication domain, and finishes everything as soon as all processes have terminated.

shared_rts_run is to be called by the master process and configures a shared communication domain for nofprocesses processes, where the individual communication buffers have a size of bufsize bytes and the shared memory segment optionally provides extra_space_size bytes of extra space. The nofprocesses worker processes are started (through fork and exec), and the parameters of the shared communication domain are passed to them through the environment (see shared_env). shared_rts_run blocks until all child processes have finished. If one of the child processes aborts or exits with a non-zero exit code, all other child processes are terminated using signal SIGTERM.

The individual worker processes invoke shared_rts_init at the beginning to connect to the communication domain and shared_rts_finish once they no longer need the connection.

EXAMPLE

This module makes it possible to create a small utility that launches worker processes sharing a communication domain:

   #include <errno.h>
   #include <stdbool.h>
   #include <stdio.h>
   #include <stdlib.h>
   #include <string.h>
   #include <afblib/shared_rts.h>

   /*
    * Convert the string str into an unsigned integer value.
    * The base is selected by strtoull with base 0 (decimal, octal
    * with a leading 0, hexadecimal with a leading 0x).
    * Returns true and stores the result in *resp on success.
    * Returns false and leaves *resp untouched if str contains no
    * number at all (empty or whitespace-only), is negative, has
    * trailing characters, or if strtoull reports an error.
    */
   bool convert_val(const char* str, unsigned long long* resp) {
      /* strtoull silently negates input with a leading '-'
         (e.g. "-1" yields ULLONG_MAX); as a fully consumed number
         cannot contain a '-' anywhere else, reject it up front */
      if (strchr(str, '-')) return false;
      char* endptr;
      errno = 0;
      unsigned long long val = strtoull(str, &endptr, 0);
      if (errno) return false;
      /* no conversion was performed: empty or whitespace-only input */
      if (endptr == str) return false;
      /* trailing characters after the number */
      if (*endptr) return false;
      *resp = val;
      return true;
   }

   /* name under which the program was invoked; set by main */
   char* cmdname;

   /* Print a usage summary to stderr and terminate with exit status 1. */
   void usage() {
      fprintf(stderr,
         "Usage: %s [-bufsize bufsize] [-extra extra_bytes] "
         "[-np nofprocesses] cmd args...\n", cmdname);
      exit(1);
   }

   /*
    * Launcher: parses the options -np, -bufsize and -extra (each
    * followed by a numeric value) and passes the remaining command
    * line to shared_rts_run.
    * Exits with status 0 if shared_rts_run returns true; otherwise
    * an error message is printed to stderr and the exit status is 1.
    * Invalid or out-of-range option values lead to the usage message.
    */
   int main(int argc, char** argv) {
      cmdname = *argv++; --argc;
      unsigned int nofprocesses = 2; size_t bufsize = 1024;
      size_t extra_space_size = 0;
      /* argc > 1 guarantees that each option is followed by
         at least one more argument, i.e. its value */
      while (argc > 1 && **argv == '-') {
         const char* option = *argv++; --argc;
         unsigned long long val;
         if (!convert_val(*argv, &val)) usage();
         /* each assignment narrows val; the comparison afterwards
            rejects values that do not fit into the target type
            instead of silently truncating them */
         if (strcmp(option, "-np") == 0) {
            nofprocesses = (unsigned int) val;
            if (nofprocesses != val) usage();
         } else if (strcmp(option, "-bufsize") == 0) {
            bufsize = (size_t) val;
            if (bufsize != val) usage();
         } else if (strcmp(option, "-extra") == 0) {
            extra_space_size = (size_t) val;
            if (extra_space_size != val) usage();
         } else {
            usage();
         }
         ++argv; --argc;
      }
      /* a command to be executed is mandatory */
      if (argc == 0) usage();
      if (shared_rts_run(nofprocesses, bufsize, extra_space_size, *argv, argv)) {
         exit(0);
      } else {
         fprintf(stderr, "%s: execution failed\n", cmdname);
         exit(1);
      }
   }

The following example demonstrates the view of the worker processes, where every process except rank 0 sends a message containing its rank to process 0:

   #include <stdio.h>
   #include <stdlib.h>
   #include <afblib/shared_domain.h>
   #include <afblib/shared_rts.h>

   /*
    * Worker: connects to the shared communication domain; every
    * process with a rank other than 0 sends its rank to process 0,
    * which prints one line per message it receives.
    */
   int main() {
      struct shared_domain* domain = shared_rts_init();
      if (domain == NULL) {
         /* no domain available -- presumably the process was not
            started through the launcher (TODO confirm) */
         printf("wasn't invoked by smrun\n");
         exit(1);
      }
      unsigned int my_rank = sd_get_rank(domain);
      unsigned int nof_workers = sd_get_nofprocesses(domain);
      if (my_rank != 0) {
         /* report our own rank to process 0 */
         sd_write(domain, 0, &my_rank, sizeof my_rank);
      } else {
         /* one read attempt for each of the other processes */
         for (unsigned int count = 1; count < nof_workers; ++count) {
            unsigned int sender;
            if (!sd_read(domain, &sender, sizeof sender)) continue;
            printf("process 0 received message from %u\n", sender);
         }
      }
      shared_rts_finish(domain);
   }

AUTHOR

Andreas F. Borchert