«  8.6. Cigarette Smokers Problem and the Limits of Semaphores and Locks   ::   Contents   ::   9.1. Parallel and Distributed Systems  »

8.7. Extended Example: Parallel Modular Exponentiation

Modular exponentiation is a mathematical calculation that is used in a variety of applications, including public key cryptography. This operation simply consists of performing an integer exponentiation and applying a modulus. For instance, 23 mod 7 = 8 mod 7 ≡ 1 mod 7. This parallel form makes use of the fact that multiplication is associative; if we want to compute (23 * 54 ) mod 7, we can calculate 23 mod 7 and 54 mod 7 in parallel, then multiply their results together. This form uses an intentionally slow implementation of modular exponentiation to illustrate the performance improvement from parallelism.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
#include <assert.h>
#include <inttypes.h>
#include <pthread.h>
#include <semaphore.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

uint64_t mod_power (uint64_t, uint64_t);
void * mod_power_consumer (void *);
uint64_t * randomize_powers (size_t, unsigned, double, size_t);

/* Can use this to change the queue size */
#define QUEUE_SIZE 10

/* Synchronization primitives and global result */
pthread_mutex_t mutex;
#define SEM_AVAILABLE "/OpenCSF_Available"
#define SEM_READY "/OpenCSF_Ready"
sem_t *space_available;
sem_t *item_ready;
uint64_t result = 1;

/* Each entry is for a single base^power computation */
struct queue_entry {
  uint64_t base;
  uint64_t power;
};

/* Variables that define the queue structure */
struct queue_entry queue[QUEUE_SIZE];
size_t next_out = 0;
size_t next_in = 0;

/* Set of 100 primes and the modulus to use for calculations */
uint64_t primes[] = {
  2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53,
  59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113,
  127, 131, 137, 139, 149, 151, 157, 163, 167, 173, 179, 181,
  191, 193, 197, 199, 211, 223, 227, 229, 233, 239, 241, 251,
  257, 263, 269, 271, 277, 281, 283, 293, 307, 311, 313, 317,
  331, 337, 347, 349, 353, 359, 367, 373, 379, 383, 389, 397,
  401, 409, 419, 421, 431, 433, 439, 443, 449, 457, 461, 463,
  467, 479, 487, 491, 499, 503, 509, 521, 523, 541
};
size_t primes_length = 100;
uint64_t modulus = 524287;

int
main (void)
{
  /* Change these numbers as desired. This set will create 4
     threads to multiple 50% of the primes, each to a power of
     at least 10,000. */
  size_t number_of_threads = 4;
  unsigned seed = 25;
  double threshold = 0.5;
  size_t minimum = 10000000;
  
  /* Generate an array of random powers to use */
  uint64_t *powers =
    randomize_powers (primes_length, seed, threshold, minimum);

  /* Initialize synchronization primitives as needed */
  pthread_mutex_init (&mutex, NULL);
  sem_unlink (SEM_AVAILABLE); // Delete an old semaphore instance
  sem_unlink (SEM_READY);     // Delete an old semaphore instance
  space_available =
    sem_open (SEM_AVAILABLE, O_CREAT | O_EXCL, S_IRUSR | S_IWUSR, QUEUE_SIZE);
  item_ready = sem_open (SEM_READY, O_CREAT | O_EXCL, S_IRUSR | S_IWUSR, 0);

  /* Create the pool of worker threads; each will  */
  pthread_t threads[number_of_threads];
  memset (threads, 0, sizeof (threads));

  size_t i;
  for (i = 0; i < number_of_threads; i++)
    pthread_create (&threads[i], NULL, mod_power_consumer, NULL);

  /* Producer side of the producer/consumer. Select a pair
     of prime/power and add the entry to the queue. */
  for (i = 0; i < primes_length; i++)
    {
      struct queue_entry entry;
      entry.base = primes[i];
      entry.power = powers[i];

      /* Wait until space is ready */
      sem_wait (space_available);
      queue[next_in] = entry;
      next_in = (next_in + 1) % QUEUE_SIZE;

      /* Signal that an item was added */
      sem_post (item_ready);
    }

  /* No more powers needed from the array */
  free (powers);
  powers = NULL;

  /* Now start filling the queue with -1 values to terminate
     the helper threads */
  for (i = 0; i < number_of_threads; i++)
    {
      struct queue_entry blank;
      blank.base = -1;
      blank.power = -1;
      
      /* Wait for space */
      sem_wait (space_available);
      queue[next_in] = blank;
      next_in = (next_in + 1) % QUEUE_SIZE;

      /* Signal that an item was added */
      sem_post (item_ready);
    }

  /* Join all of the threads when finished */
  for (i = 0; i < number_of_threads; i++)
    pthread_join (threads[i], NULL);

  /* Clean up all synchronization primitives */
  pthread_mutex_destroy (&mutex);
  sem_close (item_ready);
  sem_close (space_available);
  sem_unlink (SEM_AVAILABLE);
  sem_unlink (SEM_READY);

  printf ("Product is %" PRIu64 "\n", result);

  return EXIT_SUCCESS;
}

/* Consumer thread. Each thread runs until it receives a
   power of -1 (indicating finished). After a base/power
   pair is pulled from the queue, the thread will compute
   base^power mod modulus, then multiple that by the global
   result product. */
void *
mod_power_consumer (void * _args)
{
  struct queue_entry entry;
  while (true)
    {
      /* Consumer side waits until an item is ready */
      sem_wait (item_ready);

      /* Multiple consumers, so lock the queue */
      pthread_mutex_lock (&mutex);
      entry = queue[next_out];
      next_out += 1;
      next_out %= QUEUE_SIZE;
      pthread_mutex_unlock (&mutex);

      /* Let producer no there's at least one space */
      sem_post (space_available);

      /* Check for termination signal */
      if (entry.power == -1) break;

      /* Computer base^power, then multiple result with
         the global product */
      uint64_t modp = mod_power (entry.base, entry.power);
      pthread_mutex_lock (&mutex);
      if (entry.power > 1)
        printf ("%" PRIu64 " ^ %" PRIu64 " mod %" PRIu64 " = %" PRIu64 "\n",
                entry.base, entry.power, modulus, modp);
      result *= modp;
      result %= modulus;
      pthread_mutex_unlock (&mutex);
    }

  pthread_exit (NULL);
}

/* Intentionally slow mod power routine. Computes
   base^power mod modulus by multiplying a running
   product repeatedly by the base. Slow enough to
   see a speedup from parallel executions. */
uint64_t
mod_power (uint64_t base, uint64_t power)
{
  uint64_t index = 0;
  uint64_t product = 1;

  while (index < power)
    {
      product *= base;
      product %= modulus;
      index++;
    }
  return product;
}

/* Create an array of random powers for the primes.
   The threshold parameter determines (approximately)
   what percentage of these powers are non-zero. */
uint64_t *
randomize_powers (size_t length, unsigned seed,
                  double threshold, size_t minimum)
{
  size_t i;
  uint64_t *pows = calloc (length, sizeof (uint64_t));
  srand (seed);
  size_t randoms = length * threshold;

  for (i = 0; i < randoms; i++)
    {
      size_t index = rand () % length;
      pows[index] = (rand () % 10) * minimum;
      pows[index] += rand () % minimum;
    }
  return pows;
}
«  8.6. Cigarette Smokers Problem and the Limits of Semaphores and Locks   ::   Contents   ::   9.1. Parallel and Distributed Systems  »

Contact Us License