Skip to content

Commit

Permalink
Update sm3.c
Browse files (browse the repository at this point in the history)
Simple C implementation
  • Loading branch information
guanzhi committed Dec 13, 2023
1 parent 1d6c763 commit 7db4991
Showing 1 changed file with 29 additions and 192 deletions.
221 changes: 29 additions & 192 deletions src/sm3.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,129 +10,33 @@

#include <string.h>
#include <gmssl/sm3.h>
#include <gmssl/endian.h>
#include <gmssl/error.h>


/*
 * Optional SIMD support, compiled only when SM3_SSE3 is defined.
 * Pulls in the x86 intrinsic headers and adds a per-lane rotate helper.
 */
#ifdef SM3_SSE3
# include <x86intrin.h>
# include <immintrin.h>

/*
 * _mm_rotl_epi32(X,i): combine X shifted left by i with X shifted right by
 * 32-i (both on 32-bit lanes) using XOR. The two shifted values occupy
 * disjoint bit positions, so the result is each lane rotated left by i bits.
 * NOTE(review): the name uses the intrinsic-style `_mm_` prefix but is
 * defined locally by this file.
 */
# define _mm_rotl_epi32(X,i) \
_mm_xor_si128(_mm_slli_epi32((X),(i)), _mm_srli_epi32((X),32-(i)))
#endif
/*
 * GETU32(ptr): read four consecutive bytes starting at ptr and assemble them
 * into a uint32_t, treating ptr[0] as the most significant byte (big-endian).
 * ptr is evaluated four times, so it must not have side effects.
 */
#define GETU32(ptr) \
	( ((uint32_t)(ptr)[3])       | \
	  ((uint32_t)(ptr)[2] <<  8) | \
	  ((uint32_t)(ptr)[1] << 16) | \
	  ((uint32_t)(ptr)[0] << 24) )

/*
 * PUTU32(ptr,a): write the low 32 bits of a into ptr[0..3], most significant
 * byte first (big-endian). Both arguments are evaluated four times, so they
 * must not have side effects. The expression's value is the last byte stored.
 */
#define PUTU32(ptr,a) \
	( (ptr)[0] = (uint8_t)(((a) >> 24) & 0xff), \
	  (ptr)[1] = (uint8_t)(((a) >> 16) & 0xff), \
	  (ptr)[2] = (uint8_t)(((a) >>  8) & 0xff), \
	  (ptr)[3] = (uint8_t)((a) & 0xff) )

/*
 * ROTL(x,n): rotate the 32-bit unsigned value x left by n bits.
 * n must be in the range 1..31; n == 0 would shift right by 32 bits, which is
 * undefined for a 32-bit operand. x is evaluated twice.
 */
#define ROTL(x,n) (((x)<<(n)) | ((x)>>(32-(n))))

/*
 * P0(x) / P1(x): XOR x with two left-rotated copies of itself
 * (rotations 9 and 17 for P0, 15 and 23 for P1).
 * Each macro is defined exactly once, in terms of the local ROTL; a second,
 * conflicting definition based on ROL32 (not defined in this file) would be
 * an incompatible macro redefinition.
 */
#define P0(x) ((x) ^ ROTL((x), 9) ^ ROTL((x),17))
#define P1(x) ((x) ^ ROTL((x),15) ^ ROTL((x),23))

/*
 * Bitwise three-input functions used by the compression rounds.
 * The 00 variants are used by the first round loop (j < 16) and the 16
 * variants by the second (j up to 63).
 *
 *   FF00 / GG00: XOR of the three inputs.
 *   FF16:        bitwise majority of x, y, z.
 *   GG16:        bitwise select - takes the bit of y where x is 1, else of z.
 */
#define FF00(x,y,z) ((z) ^ (y) ^ (x))
#define FF16(x,y,z) (((y)&(z)) | ((x)&(z)) | ((x)&(y)))
#define GG00(x,y,z) ((z) ^ (y) ^ (x))
#define GG16(x,y,z) ((z) ^ ((x) & ((y)^(z))))

/*
 * R(A..H, xx): one unrolled compression round.
 *
 * Computes SS1, SS2, TT1 and TT2 from the eight state words, K[j], W[j] and
 * W[j + 4], using FF<xx>/GG<xx> (xx is token-pasted, so it must be 00 or 16).
 * It then updates only B, H, F and D in place and increments j. The state
 * words are not shifted here; the caller is expected to rotate the argument
 * order between invocations (see R8).
 *
 * Relies on SS1, SS2, TT1, TT2, j, K and W being in scope at the expansion
 * site, and on ROL32, which is not defined in this file.
 * NOTE(review): expands to several statements without a do { } while (0)
 * wrapper, so it must not be used as the unbraced body of an if/for.
 */
#define R(A, B, C, D, E, F, G, H, xx) \
SS1 = ROL32((ROL32(A, 12) + E + K[j]), 7); \
SS2 = SS1 ^ ROL32(A, 12); \
TT1 = FF##xx(A, B, C) + D + SS2 + (W[j] ^ W[j + 4]); \
TT2 = GG##xx(E, F, G) + H + SS1 + W[j]; \
B = ROL32(B, 9); \
H = TT1; \
F = ROL32(F, 19); \
D = P0(TT2); \
j++

/*
 * R8(A..H, xx): eight consecutive rounds. Each R invocation receives the
 * arguments rotated by one position (H moves to the front), so after eight
 * invocations the argument order is back where it started. Advances j by 8.
 */
#define R8(A, B, C, D, E, F, G, H, xx) \
R(A, B, C, D, E, F, G, H, xx); \
R(H, A, B, C, D, E, F, G, xx); \
R(G, H, A, B, C, D, E, F, xx); \
R(F, G, H, A, B, C, D, E, xx); \
R(E, F, G, H, A, B, C, D, xx); \
R(D, E, F, G, H, A, B, C, xx); \
R(C, D, E, F, G, H, A, B, xx); \
R(B, C, D, E, F, G, H, A, xx)



/*
 * Base round constants: T00 has the same value as K0 and T16 the same value
 * as K32. Neither name is referenced in the visible part of this file.
 */
#define T00 0x79cc4519U
#define T16 0x7a879d8aU

/*
 * K0..K63: per-round constants used to initialise the K[64] table.
 * For j in 0..15 the value is T00 rotated left by j bits; for j in 16..63 it
 * is T16 rotated left by (j mod 32) bits, which is why K48..K63 repeat
 * K16..K31.
 */
#define K0 0x79cc4519U
#define K1 0xf3988a32U
#define K2 0xe7311465U
#define K3 0xce6228cbU
#define K4 0x9cc45197U
#define K5 0x3988a32fU
#define K6 0x7311465eU
#define K7 0xe6228cbcU
#define K8 0xcc451979U
#define K9 0x988a32f3U
#define K10 0x311465e7U
#define K11 0x6228cbceU
#define K12 0xc451979cU
#define K13 0x88a32f39U
#define K14 0x11465e73U
#define K15 0x228cbce6U
#define K16 0x9d8a7a87U
#define K17 0x3b14f50fU
#define K18 0x7629ea1eU
#define K19 0xec53d43cU
#define K20 0xd8a7a879U
#define K21 0xb14f50f3U
#define K22 0x629ea1e7U
#define K23 0xc53d43ceU
#define K24 0x8a7a879dU
#define K25 0x14f50f3bU
#define K26 0x29ea1e76U
#define K27 0x53d43cecU
#define K28 0xa7a879d8U
#define K29 0x4f50f3b1U
#define K30 0x9ea1e762U
#define K31 0x3d43cec5U
#define K32 0x7a879d8aU
#define K33 0xf50f3b14U
#define K34 0xea1e7629U
#define K35 0xd43cec53U
#define K36 0xa879d8a7U
#define K37 0x50f3b14fU
#define K38 0xa1e7629eU
#define K39 0x43cec53dU
#define K40 0x879d8a7aU
#define K41 0x0f3b14f5U
#define K42 0x1e7629eaU
#define K43 0x3cec53d4U
#define K44 0x79d8a7a8U
#define K45 0xf3b14f50U
#define K46 0xe7629ea1U
#define K47 0xcec53d43U
#define K48 0x9d8a7a87U
#define K49 0x3b14f50fU
#define K50 0x7629ea1eU
#define K51 0xec53d43cU
#define K52 0xd8a7a879U
#define K53 0xb14f50f3U
#define K54 0x629ea1e7U
#define K55 0xc53d43ceU
#define K56 0x8a7a879dU
#define K57 0x14f50f3bU
#define K58 0x29ea1e76U
#define K59 0x53d43cecU
#define K60 0xa7a879d8U
#define K61 0x4f50f3b1U
#define K62 0x9ea1e762U
#define K63 0x3d43cec5U

static uint32_t K[64] = {
K0, K1, K2, K3, K4, K5, K6, K7,
K8, K9, K10, K11, K12, K13, K14, K15,
K16, K17, K18, K19, K20, K21, K22, K23,
K24, K25, K26, K27, K28, K29, K30, K31,
K32, K33, K34, K35, K36, K37, K38, K39,
K40, K41, K42, K43, K44, K45, K46, K47,
K48, K49, K50, K51, K52, K53, K54, K55,
K56, K57, K58, K59, K60, K61, K62, K63,
/*
0x79cc4519U, 0xf3988a32U, 0xe7311465U, 0xce6228cbU,
0x9cc45197U, 0x3988a32fU, 0x7311465eU, 0xe6228cbcU,
0xcc451979U, 0x988a32f3U, 0x311465e7U, 0x6228cbceU,
Expand All @@ -149,10 +53,8 @@ static uint32_t K[64] = {
0xd8a7a879U, 0xb14f50f3U, 0x629ea1e7U, 0xc53d43ceU,
0x8a7a879dU, 0x14f50f3bU, 0x29ea1e76U, 0x53d43cecU,
0xa7a879d8U, 0x4f50f3b1U, 0x9ea1e762U, 0x3d43cec5U,
*/
};

#ifndef SM3_AVX_BMI2
void sm3_compress_blocks(uint32_t digest[8], const uint8_t *data, size_t blocks)
{
uint32_t A;
Expand All @@ -167,12 +69,6 @@ void sm3_compress_blocks(uint32_t digest[8], const uint8_t *data, size_t blocks)
uint32_t SS1, SS2, TT1, TT2;
int j;

#ifdef SM3_SSE3
__m128i X, T, R;
__m128i M = _mm_setr_epi32(0, 0, 0, 0xffffffff);
__m128i V = _mm_setr_epi8(3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12);
#endif

while (blocks--) {

A = digest[0];
Expand All @@ -184,103 +80,44 @@ void sm3_compress_blocks(uint32_t digest[8], const uint8_t *data, size_t blocks)
G = digest[6];
H = digest[7];


#ifdef SM3_SSE3

for (j = 0; j < 16; j += 4) {
X = _mm_loadu_si128((__m128i *)(data + j * 4));
X = _mm_shuffle_epi8(X, V);
_mm_storeu_si128((__m128i *)(W + j), X);
for (j = 0; j < 16; j++) {
W[j] = GETU32(data + j*4);
}

for (j = 16; j < 68; j += 4) {
/* X = (W[j - 3], W[j - 2], W[j - 1], 0) */
X = _mm_loadu_si128((__m128i *)(W + j - 3));
X = _mm_andnot_si128(M, X);

X = _mm_rotl_epi32(X, 15);
T = _mm_loadu_si128((__m128i *)(W + j - 9));
X = _mm_xor_si128(X, T);
T = _mm_loadu_si128((__m128i *)(W + j - 16));
X = _mm_xor_si128(X, T);

/* P1() */
T = _mm_rotl_epi32(X, (23 - 15));
T = _mm_xor_si128(T, X);
T = _mm_rotl_epi32(T, 15);
X = _mm_xor_si128(X, T);

T = _mm_loadu_si128((__m128i *)(W + j - 13));
T = _mm_rotl_epi32(T, 7);
X = _mm_xor_si128(X, T);
T = _mm_loadu_si128((__m128i *)(W + j - 6));
X = _mm_xor_si128(X, T);

/* W[j + 3] ^= P1(ROL32(W[j + 1], 15)) */
R = _mm_shuffle_epi32(X, 0);
R = _mm_and_si128(R, M);
T = _mm_rotl_epi32(R, 15);
T = _mm_xor_si128(T, R);
T = _mm_rotl_epi32(T, 9);
R = _mm_xor_si128(R, T);
R = _mm_rotl_epi32(R, 6);
X = _mm_xor_si128(X, R);

_mm_storeu_si128((__m128i *)(W + j), X);
for (; j < 68; j++) {
W[j] = P1(W[j - 16] ^ W[j - 9] ^ ROTL(W[j - 3], 15))
^ ROTL(W[j - 13], 7) ^ W[j - 6];
}
#else
for (j = 0; j < 16; j++)
W[j] = GETU32(data + j*4);

for (; j < 68; j++)
W[j] = P1(W[j - 16] ^ W[j - 9] ^ ROL32(W[j - 3], 15))
^ ROL32(W[j - 13], 7) ^ W[j - 6];
#endif


j = 0;

#define FULL_UNROLL
#ifdef FULL_UNROLL
R8(A, B, C, D, E, F, G, H, 00);
R8(A, B, C, D, E, F, G, H, 00);
R8(A, B, C, D, E, F, G, H, 16);
R8(A, B, C, D, E, F, G, H, 16);
R8(A, B, C, D, E, F, G, H, 16);
R8(A, B, C, D, E, F, G, H, 16);
R8(A, B, C, D, E, F, G, H, 16);
R8(A, B, C, D, E, F, G, H, 16);
#else
for (; j < 16; j++) {
SS1 = ROL32((ROL32(A, 12) + E + K(j)), 7);
SS2 = SS1 ^ ROL32(A, 12);
for (j = 0; j < 16; j++) {
SS1 = ROTL((ROTL(A, 12) + E + K[j]), 7);
SS2 = SS1 ^ ROTL(A, 12);
TT1 = FF00(A, B, C) + D + SS2 + (W[j] ^ W[j + 4]);
TT2 = GG00(E, F, G) + H + SS1 + W[j];
D = C;
C = ROL32(B, 9);
C = ROTL(B, 9);
B = A;
A = TT1;
H = G;
G = ROL32(F, 19);
G = ROTL(F, 19);
F = E;
E = P0(TT2);
}

for (; j < 64; j++) {
SS1 = ROL32((ROL32(A, 12) + E + K(j)), 7);
SS2 = SS1 ^ ROL32(A, 12);
SS1 = ROTL((ROTL(A, 12) + E + K[j]), 7);
SS2 = SS1 ^ ROTL(A, 12);
TT1 = FF16(A, B, C) + D + SS2 + (W[j] ^ W[j + 4]);
TT2 = GG16(E, F, G) + H + SS1 + W[j];
D = C;
C = ROL32(B, 9);
C = ROTL(B, 9);
B = A;
A = TT1;
H = G;
G = ROL32(F, 19);
G = ROTL(F, 19);
F = E;
E = P0(TT2);
}
#endif

digest[0] ^= A;
digest[1] ^= B;
Expand All @@ -294,7 +131,6 @@ void sm3_compress_blocks(uint32_t digest[8], const uint8_t *data, size_t blocks)
data += 64;
}
}
#endif

void sm3_init(SM3_CTX *ctx)
{
Expand Down Expand Up @@ -357,10 +193,11 @@ void sm3_finish(SM3_CTX *ctx, uint8_t *digest)
sm3_compress_blocks(ctx->digest, ctx->block, 1);
memset(ctx->block, 0, SM3_BLOCK_SIZE - 8);
}

PUTU32(ctx->block + 56, ctx->nblocks >> 23);
PUTU32(ctx->block + 60, (ctx->nblocks << 9) + (ctx->num << 3));

sm3_compress_blocks(ctx->digest, ctx->block, 1);

for (i = 0; i < 8; i++) {
PUTU32(digest + i*4, ctx->digest[i]);
}
Expand Down

0 comments on commit 7db4991

Please sign in to comment.