8.7. Extended Example: Parallel Modular Exponentiation¶

Modular exponentiation is a mathematical calculation that is used in a variety of applications, including public key cryptography. This operation simply consists of performing an integer exponentiation and applying a modulus. For instance, 23 mod 7 = 8 mod 7 ≡ 1 mod 7. This parallel form makes use of the fact that multiplication is associative; if we want to compute (23 * 54 ) mod 7, we can calculate 23 mod 7 and 54 mod 7 in parallel, then multiply their results together. This form uses an intentionally slow implementation of modular exponentiation to illustrate the performance improvement from parallelism.

  1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 #include #include #include #include #include #include #include #include #include uint64_t mod_power (uint64_t, uint64_t); void * mod_power_consumer (void *); uint64_t * randomize_powers (size_t, unsigned, double, size_t); /* Can use this to change the queue size */ #define QUEUE_SIZE 10 /* Synchronization primitives and global result */ pthread_mutex_t mutex; #define SEM_AVAILABLE "/OpenCSF_Available" #define SEM_READY "/OpenCSF_Ready" sem_t *space_available; sem_t *item_ready; uint64_t result = 1; /* Each entry is for a single base^power computation */ struct queue_entry { uint64_t base; uint64_t power; }; /* Variables that define the queue structure */ struct queue_entry queue[QUEUE_SIZE]; size_t next_out = 0; size_t next_in = 0; /* Set of 100 primes and the modulus to use for calculations */ uint64_t primes[] = { 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131, 137, 139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211, 223, 227, 229, 233, 239, 241, 251, 257, 263, 269, 271, 277, 281, 283, 293, 307, 311, 313, 317, 331, 337, 347, 349, 353, 359, 367, 373, 379, 383, 389, 397, 401, 409, 419, 421, 431, 433, 439, 443, 449, 457, 461, 463, 467, 479, 487, 491, 499, 503, 509, 521, 523, 541 }; size_t primes_length = 100; uint64_t modulus = 524287; int main (void) { /* Change these numbers as desired. This set will create 4 threads to multiple 50% of the primes, each to a power of at least 10,000. */ size_t number_of_threads = 4; unsigned seed = 25; double threshold = 0.5; size_t minimum = 10000000; /* Generate an array of random powers to use */ uint64_t *powers = randomize_powers (primes_length, seed, threshold, minimum); /* Initialize synchronization primitives as needed */ pthread_mutex_init (&mutex, NULL); sem_unlink (SEM_AVAILABLE); // Delete an old semaphore instance sem_unlink (SEM_READY); // Delete an old semaphore instance space_available = sem_open (SEM_AVAILABLE, O_CREAT | O_EXCL, S_IRUSR | S_IWUSR, QUEUE_SIZE); item_ready = sem_open (SEM_READY, O_CREAT | O_EXCL, S_IRUSR | S_IWUSR, 0); /* Create the pool of worker threads; each will */ pthread_t threads[number_of_threads]; memset (threads, 0, sizeof (threads)); size_t i; for (i = 0; i < number_of_threads; i++) pthread_create (&threads[i], NULL, mod_power_consumer, NULL); /* Producer side of the producer/consumer. Select a pair of prime/power and add the entry to the queue. */ for (i = 0; i < primes_length; i++) { struct queue_entry entry; entry.base = primes[i]; entry.power = powers[i]; /* Wait until space is ready */ sem_wait (space_available); queue[next_in] = entry; next_in = (next_in + 1) % QUEUE_SIZE; /* Signal that an item was added */ sem_post (item_ready); } /* No more powers needed from the array */ free (powers); powers = NULL; /* Now start filling the queue with -1 values to terminate the helper threads */ for (i = 0; i < number_of_threads; i++) { struct queue_entry blank; blank.base = -1; blank.power = -1; /* Wait for space */ sem_wait (space_available); queue[next_in] = blank; next_in = (next_in + 1) % QUEUE_SIZE; /* Signal that an item was added */ sem_post (item_ready); } /* Join all of the threads when finished */ for (i = 0; i < number_of_threads; i++) pthread_join (threads[i], NULL); /* Clean up all synchronization primitives */ pthread_mutex_destroy (&mutex); sem_close (item_ready); sem_close (space_available); sem_unlink (SEM_AVAILABLE); sem_unlink (SEM_READY); printf ("Product is %" PRIu64 "\n", result); return EXIT_SUCCESS; } /* Consumer thread. Each thread runs until it receives a power of -1 (indicating finished). After a base/power pair is pulled from the queue, the thread will compute base^power mod modulus, then multiple that by the global result product. */ void * mod_power_consumer (void * _args) { struct queue_entry entry; while (true) { /* Consumer side waits until an item is ready */ sem_wait (item_ready); /* Multiple consumers, so lock the queue */ pthread_mutex_lock (&mutex); entry = queue[next_out]; next_out += 1; next_out %= QUEUE_SIZE; pthread_mutex_unlock (&mutex); /* Let producer no there's at least one space */ sem_post (space_available); /* Check for termination signal */ if (entry.power == -1) break; /* Computer base^power, then multiple result with the global product */ uint64_t modp = mod_power (entry.base, entry.power); pthread_mutex_lock (&mutex); if (entry.power > 1) printf ("%" PRIu64 " ^ %" PRIu64 " mod %" PRIu64 " = %" PRIu64 "\n", entry.base, entry.power, modulus, modp); result *= modp; result %= modulus; pthread_mutex_unlock (&mutex); } pthread_exit (NULL); } /* Intentionally slow mod power routine. Computes base^power mod modulus by multiplying a running product repeatedly by the base. Slow enough to see a speedup from parallel executions. */ uint64_t mod_power (uint64_t base, uint64_t power) { uint64_t index = 0; uint64_t product = 1; while (index < power) { product *= base; product %= modulus; index++; } return product; } /* Create an array of random powers for the primes. The threshold parameter determines (approximately) what percentage of these powers are non-zero. */ uint64_t * randomize_powers (size_t length, unsigned seed, double threshold, size_t minimum) { size_t i; uint64_t *pows = calloc (length, sizeof (uint64_t)); srand (seed); size_t randoms = length * threshold; for (i = 0; i < randoms; i++) { size_t index = rand () % length; pows[index] = (rand () % 10) * minimum; pows[index] += rand () % minimum; } return pows; }