@@ -205,6 +205,7 @@ static void keccak_f1600_round(uint2* a, uint r)
205
205
t [2 ] = a [2 ] ^ a [7 ] ^ a [12 ] ^ a [17 ] ^ a [22 ];
206
206
t [3 ] = a [3 ] ^ a [8 ] ^ a [13 ] ^ a [18 ] ^ a [23 ];
207
207
t [4 ] = a [4 ] ^ a [9 ] ^ a [14 ] ^ a [19 ] ^ a [24 ];
208
+
208
209
u = t [4 ] ^ ROL2_small (t [1 ], 1 );
209
210
a [0 ] ^= u ;
210
211
a [5 ] ^= u ;
@@ -315,12 +316,7 @@ static void keccak_f1600_no_absorb(uint2* a, uint rounds, uint isolate)
315
316
for (uint r = 0 ; r < rounds ;)
316
317
{
317
318
// This dynamic branch stops the AMD compiler unrolling the loop
318
- // and additionally saves about 33% of the VGPRs, enough to gain another
319
- // wavefront. Ideally we'd get 4 in flight, but 3 is the best I can
320
- // massage out of the compiler. It doesn't really seem to matter how
321
- // much we try and help the compiler save VGPRs because it seems to throw
322
- // that information away, hence the implementation of keccak here
323
- // doesn't bother.
319
+
324
320
//if (isolate)
325
321
//{
326
322
keccak_f1600_round (a , r ++ );
@@ -329,125 +325,6 @@ static void keccak_f1600_no_absorb(uint2* a, uint rounds, uint isolate)
329
325
}
330
326
331
327
332
- void keccak_alt (uint2 * state , uint rounds )
333
- {
334
- // based on code from Azlehria's 0xbitcoin miner
335
- uint2 C [5 ], D [5 ];
336
-
337
- for (uint i = 0 ; i < rounds ; ++ i )
338
- {
339
- C [0 ] = state [0 ] ^ state [5 ] ^ state [10 ] ^ state [15 ] ^ state [20 ];
340
- C [1 ] = state [1 ] ^ state [6 ] ^ state [11 ] ^ state [16 ] ^ state [21 ];
341
- C [2 ] = state [2 ] ^ state [7 ] ^ state [12 ] ^ state [17 ] ^ state [22 ];
342
- C [3 ] = state [3 ] ^ state [8 ] ^ state [13 ] ^ state [18 ] ^ state [23 ];
343
- C [4 ] = state [4 ] ^ state [9 ] ^ state [14 ] ^ state [19 ] ^ state [24 ];
344
-
345
- D [0 ] = ROL2 (C [1 ], 1 ) ^ C [4 ];
346
- state [0 ] ^= D [0 ];
347
- state [5 ] ^= D [0 ];
348
- state [10 ] ^= D [0 ];
349
- state [15 ] ^= D [0 ];
350
- state [20 ] ^= D [0 ];
351
-
352
- D [0 ] = ROL2 (C [2 ], 1 ) ^ C [0 ];
353
- state [1 ] ^= D [0 ];
354
- state [6 ] ^= D [0 ];
355
- state [11 ] ^= D [0 ];
356
- state [16 ] ^= D [0 ];
357
- state [21 ] ^= D [0 ];
358
-
359
- D [0 ] = ROL2 (C [3 ], 1 ) ^ C [1 ];
360
- state [2 ] ^= D [0 ];
361
- state [7 ] ^= D [0 ];
362
- state [12 ] ^= D [0 ];
363
- state [17 ] ^= D [0 ];
364
- state [22 ] ^= D [0 ];
365
-
366
- D [0 ] = ROL2 (C [4 ], 1 ) ^ C [2 ];
367
- state [3 ] ^= D [0 ];
368
- state [8 ] ^= D [0 ];
369
- state [13 ] ^= D [0 ];
370
- state [18 ] ^= D [0 ];
371
- state [23 ] ^= D [0 ];
372
-
373
- D [0 ] = ROL2 (C [0 ], 1 ) ^ C [3 ];
374
- state [4 ] ^= D [0 ];
375
- state [9 ] ^= D [0 ];
376
- state [14 ] ^= D [0 ];
377
- state [19 ] ^= D [0 ];
378
- state [24 ] ^= D [0 ];
379
-
380
- C [0 ] = state [1 ];
381
- state [1 ] = ROL2 (state [6 ], 44 );
382
- state [6 ] = ROL2 (state [9 ], 20 );
383
- state [9 ] = ROL2 (state [22 ], 61 );
384
- state [22 ] = ROL2 (state [14 ], 39 );
385
- state [14 ] = ROL2 (state [20 ], 18 );
386
- state [20 ] = ROL2 (state [2 ], 62 );
387
- state [2 ] = ROL2 (state [12 ], 43 );
388
- state [12 ] = ROL2 (state [13 ], 25 );
389
- state [13 ] = ROL2 (state [19 ], 8 );
390
- state [19 ] = ROL2 (state [23 ], 56 );
391
- state [23 ] = ROL2 (state [15 ], 41 );
392
- state [15 ] = ROL2 (state [4 ], 27 );
393
- state [4 ] = ROL2 (state [24 ], 14 );
394
- state [24 ] = ROL2 (state [21 ], 2 );
395
- state [21 ] = ROL2 (state [8 ], 55 );
396
- state [8 ] = ROL2 (state [16 ], 45 );
397
- state [16 ] = ROL2 (state [5 ], 36 );
398
- state [5 ] = ROL2 (state [3 ], 28 );
399
- state [3 ] = ROL2 (state [18 ], 21 );
400
- state [18 ] = ROL2 (state [17 ], 15 );
401
- state [17 ] = ROL2 (state [11 ], 10 );
402
- state [11 ] = ROL2 (state [7 ], 6 );
403
- state [7 ] = ROL2 (state [10 ], 3 );
404
- state [10 ] = ROL2 (C [0 ], 1 );
405
-
406
- C [0 ] = state [0 ];
407
- C [1 ] = state [1 ];
408
- state [0 ] = chi (state [0 ], state [1 ], state [2 ]);
409
- state [0 ] ^= Keccak_f1600_RC [i ];
410
- state [1 ] = chi (state [1 ], state [2 ], state [3 ]);
411
- state [2 ] = chi (state [2 ], state [3 ], state [4 ]);
412
- state [3 ] = chi (state [3 ], state [4 ], C [0 ]);
413
- state [4 ] = chi (state [4 ], C [0 ], C [1 ]);
414
-
415
- C [0 ] = state [5 ];
416
- C [1 ] = state [6 ];
417
- state [5 ] = chi (state [5 ], state [6 ], state [7 ]);
418
- state [6 ] = chi (state [6 ], state [7 ], state [8 ]);
419
- state [7 ] = chi (state [7 ], state [8 ], state [9 ]);
420
- state [8 ] = chi (state [8 ], state [9 ], C [0 ]);
421
- state [9 ] = chi (state [9 ], C [0 ], C [1 ]);
422
-
423
- C [0 ] = state [10 ];
424
- C [1 ] = state [11 ];
425
- state [10 ] = chi (state [10 ], state [11 ], state [12 ]);
426
- state [11 ] = chi (state [11 ], state [12 ], state [13 ]);
427
- state [12 ] = chi (state [12 ], state [13 ], state [14 ]);
428
- state [13 ] = chi (state [13 ], state [14 ], C [0 ]);
429
- state [14 ] = chi (state [14 ], C [0 ], C [1 ]);
430
-
431
- C [0 ] = state [15 ];
432
- C [1 ] = state [16 ];
433
- state [15 ] = chi (state [15 ], state [16 ], state [17 ]);
434
- state [16 ] = chi (state [16 ], state [17 ], state [18 ]);
435
- state [17 ] = chi (state [17 ], state [18 ], state [19 ]);
436
- state [18 ] = chi (state [18 ], state [19 ], C [0 ]);
437
- state [19 ] = chi (state [19 ], C [0 ], C [1 ]);
438
-
439
- C [0 ] = state [20 ];
440
- C [1 ] = state [21 ];
441
- state [20 ] = chi (state [20 ], state [21 ], state [22 ]);
442
- state [21 ] = chi (state [21 ], state [22 ], state [23 ]);
443
- state [22 ] = chi (state [22 ], state [23 ], state [24 ]);
444
- state [23 ] = chi (state [23 ], state [24 ], C [0 ]);
445
- state [24 ] = chi (state [24 ], C [0 ], C [1 ]);
446
-
447
- }
448
-
449
- }
450
-
451
328
452
329
/*-----------------------------------------------------------------------------------
453
330
* test_keccak
@@ -462,10 +339,11 @@ __kernel void test_keccak(
462
339
__global volatile uint * restrict g_output , // 32 bytes (8 uints)
463
340
uint isolate
464
341
) {
465
- uint const gid = get_global_id ( 0 );
466
- if ( gid != 6 ) return ;
342
+ // the assumption is that the kernel will be invoked with only 1 work item, since
343
+ // every work item writes the results to the beginning of g_ouput.
467
344
468
345
hash200_t state ;
346
+ uint const gid = get_global_id (0 );
469
347
470
348
copy (state .uchars , g_challenge , 32 );
471
349
copy (state .words + 8 , g_sender , 5 );
@@ -496,8 +374,7 @@ __kernel void bitcoin0x_search(
496
374
__constant uint const * g_nonce , // 32 bytes (8 uints)
497
375
__global volatile uint * restrict g_output ,
498
376
ulong target ,
499
- uint isolate ,
500
- __global volatile hash200_t * restrict g_buff // 200 bytes, used for debugging
377
+ uint isolate
501
378
)
502
379
{
503
380
uint const gid = get_global_id (0 );
@@ -518,7 +395,6 @@ __kernel void bitcoin0x_search(
518
395
state .uchars [84 ] = 0x01 ;
519
396
state .uchars [135 ] = 0x80 ;
520
397
keccak_f1600_no_absorb ((uint2 * ) & state , 24 , isolate );
521
- //keccak_alt((uint2*) &state, 23);
522
398
//keccak_final_round((uint2*) &state);
523
399
524
400
// pick off upper 64 bits of hash
0 commit comments