Skip to content

Commit bc9cb13

Browse files
kernel cleanup
1 parent 2b7377f commit bc9cb13

File tree

3 files changed

+7
-133
lines changed

3 files changed

+7
-133
lines changed

libethash-cl/ethash_cl_miner.cpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -693,7 +693,6 @@ bool ethash_cl_miner::init(unsigned _platformId, unsigned _deviceId)
693693
// buffers
694694
m_challenge = cl::Buffer(m_context, CL_MEM_READ_ONLY, 32);
695695
m_sender = cl::Buffer(m_context, CL_MEM_READ_ONLY, 20);
696-
m_buff = cl::Buffer(m_context, CL_MEM_WRITE_ONLY, 200); // used for debugging
697696

698697
for (unsigned i = 0; i != c_bufferCount; ++i)
699698
{
@@ -707,7 +706,6 @@ bool ethash_cl_miner::init(unsigned _platformId, unsigned _deviceId)
707706

708707
m_searchKernel.setArg(1, m_sender);
709708
m_searchKernel.setArg(5, ~0u); // isolate argument
710-
m_searchKernel.setArg(6, m_buff);
711709
}
712710
catch (cl::Error const& err)
713711
{

libethash-cl/ethash_cl_miner.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ class ethash_cl_miner
104104
cl::Buffer m_searchBuffer[c_bufferCount];
105105
cl::Buffer m_nonceBuffer[c_bufferCount];
106106
cl::Buffer m_bestHashBuff;
107-
cl::Buffer m_challenge, m_sender, m_buff;
107+
cl::Buffer m_challenge, m_sender;
108108
unsigned m_globalWorkSize;
109109
bool m_openclOnePointOne;
110110

libethash-cl/ethash_cl_miner_kernel.cl

Lines changed: 6 additions & 130 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,7 @@ static void keccak_f1600_round(uint2* a, uint r)
205205
t[2] = a[2] ^ a[7] ^ a[12] ^ a[17] ^ a[22];
206206
t[3] = a[3] ^ a[8] ^ a[13] ^ a[18] ^ a[23];
207207
t[4] = a[4] ^ a[9] ^ a[14] ^ a[19] ^ a[24];
208+
208209
u = t[4] ^ ROL2_small(t[1], 1);
209210
a[0] ^= u;
210211
a[5] ^= u;
@@ -315,12 +316,7 @@ static void keccak_f1600_no_absorb(uint2* a, uint rounds, uint isolate)
315316
for (uint r = 0; r < rounds;)
316317
{
317318
// This dynamic branch stops the AMD compiler unrolling the loop
318-
// and additionally saves about 33% of the VGPRs, enough to gain another
319-
// wavefront. Ideally we'd get 4 in flight, but 3 is the best I can
320-
// massage out of the compiler. It doesn't really seem to matter how
321-
// much we try and help the compiler save VGPRs because it seems to throw
322-
// that information away, hence the implementation of keccak here
323-
// doesn't bother.
319+
324320
//if (isolate)
325321
//{
326322
keccak_f1600_round(a, r++);
@@ -329,125 +325,6 @@ static void keccak_f1600_no_absorb(uint2* a, uint rounds, uint isolate)
329325
}
330326

331327

332-
void keccak_alt(uint2* state, uint rounds)
333-
{
334-
// based on code from Azlehria's 0xbitcoin miner
335-
uint2 C[5], D[5];
336-
337-
for (uint i = 0; i < rounds; ++i)
338-
{
339-
C[0] = state[0] ^ state[5] ^ state[10] ^ state[15] ^ state[20];
340-
C[1] = state[1] ^ state[6] ^ state[11] ^ state[16] ^ state[21];
341-
C[2] = state[2] ^ state[7] ^ state[12] ^ state[17] ^ state[22];
342-
C[3] = state[3] ^ state[8] ^ state[13] ^ state[18] ^ state[23];
343-
C[4] = state[4] ^ state[9] ^ state[14] ^ state[19] ^ state[24];
344-
345-
D[0] = ROL2(C[1], 1) ^ C[4];
346-
state[0] ^= D[0];
347-
state[5] ^= D[0];
348-
state[10] ^= D[0];
349-
state[15] ^= D[0];
350-
state[20] ^= D[0];
351-
352-
D[0] = ROL2(C[2], 1) ^ C[0];
353-
state[1] ^= D[0];
354-
state[6] ^= D[0];
355-
state[11] ^= D[0];
356-
state[16] ^= D[0];
357-
state[21] ^= D[0];
358-
359-
D[0] = ROL2(C[3], 1) ^ C[1];
360-
state[2] ^= D[0];
361-
state[7] ^= D[0];
362-
state[12] ^= D[0];
363-
state[17] ^= D[0];
364-
state[22] ^= D[0];
365-
366-
D[0] = ROL2(C[4], 1) ^ C[2];
367-
state[3] ^= D[0];
368-
state[8] ^= D[0];
369-
state[13] ^= D[0];
370-
state[18] ^= D[0];
371-
state[23] ^= D[0];
372-
373-
D[0] = ROL2(C[0], 1) ^ C[3];
374-
state[4] ^= D[0];
375-
state[9] ^= D[0];
376-
state[14] ^= D[0];
377-
state[19] ^= D[0];
378-
state[24] ^= D[0];
379-
380-
C[0] = state[1];
381-
state[1] = ROL2(state[6], 44);
382-
state[6] = ROL2(state[9], 20);
383-
state[9] = ROL2(state[22], 61);
384-
state[22] = ROL2(state[14], 39);
385-
state[14] = ROL2(state[20], 18);
386-
state[20] = ROL2(state[2], 62);
387-
state[2] = ROL2(state[12], 43);
388-
state[12] = ROL2(state[13], 25);
389-
state[13] = ROL2(state[19], 8);
390-
state[19] = ROL2(state[23], 56);
391-
state[23] = ROL2(state[15], 41);
392-
state[15] = ROL2(state[4], 27);
393-
state[4] = ROL2(state[24], 14);
394-
state[24] = ROL2(state[21], 2);
395-
state[21] = ROL2(state[8], 55);
396-
state[8] = ROL2(state[16], 45);
397-
state[16] = ROL2(state[5], 36);
398-
state[5] = ROL2(state[3], 28);
399-
state[3] = ROL2(state[18], 21);
400-
state[18] = ROL2(state[17], 15);
401-
state[17] = ROL2(state[11], 10);
402-
state[11] = ROL2(state[7], 6);
403-
state[7] = ROL2(state[10], 3);
404-
state[10] = ROL2(C[0], 1);
405-
406-
C[0] = state[0];
407-
C[1] = state[1];
408-
state[0] = chi(state[0], state[1], state[2]);
409-
state[0] ^= Keccak_f1600_RC[i];
410-
state[1] = chi(state[1], state[2], state[3]);
411-
state[2] = chi(state[2], state[3], state[4]);
412-
state[3] = chi(state[3], state[4], C[0]);
413-
state[4] = chi(state[4], C[0], C[1]);
414-
415-
C[0] = state[5];
416-
C[1] = state[6];
417-
state[5] = chi(state[5], state[6], state[7]);
418-
state[6] = chi(state[6], state[7], state[8]);
419-
state[7] = chi(state[7], state[8], state[9]);
420-
state[8] = chi(state[8], state[9], C[0]);
421-
state[9] = chi(state[9], C[0], C[1]);
422-
423-
C[0] = state[10];
424-
C[1] = state[11];
425-
state[10] = chi(state[10], state[11], state[12]);
426-
state[11] = chi(state[11], state[12], state[13]);
427-
state[12] = chi(state[12], state[13], state[14]);
428-
state[13] = chi(state[13], state[14], C[0]);
429-
state[14] = chi(state[14], C[0], C[1]);
430-
431-
C[0] = state[15];
432-
C[1] = state[16];
433-
state[15] = chi(state[15], state[16], state[17]);
434-
state[16] = chi(state[16], state[17], state[18]);
435-
state[17] = chi(state[17], state[18], state[19]);
436-
state[18] = chi(state[18], state[19], C[0]);
437-
state[19] = chi(state[19], C[0], C[1]);
438-
439-
C[0] = state[20];
440-
C[1] = state[21];
441-
state[20] = chi(state[20], state[21], state[22]);
442-
state[21] = chi(state[21], state[22], state[23]);
443-
state[22] = chi(state[22], state[23], state[24]);
444-
state[23] = chi(state[23], state[24], C[0]);
445-
state[24] = chi(state[24], C[0], C[1]);
446-
447-
}
448-
449-
}
450-
451328

452329
/*-----------------------------------------------------------------------------------
453330
* test_keccak
@@ -462,10 +339,11 @@ __kernel void test_keccak(
462339
__global volatile uint* restrict g_output, // 32 bytes (8 uints)
463340
uint isolate
464341
) {
465-
uint const gid = get_global_id(0);
466-
if (gid != 6) return;
342+
// the assumption is that the kernel will be invoked with only 1 work item, since
343+
// every work item writes the results to the beginning of g_ouput.
467344

468345
hash200_t state;
346+
uint const gid = get_global_id(0);
469347

470348
copy(state.uchars, g_challenge, 32);
471349
copy(state.words + 8, g_sender, 5);
@@ -496,8 +374,7 @@ __kernel void bitcoin0x_search(
496374
__constant uint const* g_nonce, // 32 bytes (8 uints)
497375
__global volatile uint* restrict g_output,
498376
ulong target,
499-
uint isolate,
500-
__global volatile hash200_t* restrict g_buff // 200 bytes, used for debugging
377+
uint isolate
501378
)
502379
{
503380
uint const gid = get_global_id(0);
@@ -518,7 +395,6 @@ __kernel void bitcoin0x_search(
518395
state.uchars[84] = 0x01;
519396
state.uchars[135] = 0x80;
520397
keccak_f1600_no_absorb((uint2*) &state, 24, isolate);
521-
//keccak_alt((uint2*) &state, 23);
522398
//keccak_final_round((uint2*) &state);
523399

524400
// pick off upper 64 bits of hash

0 commit comments

Comments
 (0)