Skip to content

Commit

Permalink
switch the BLAKE2 implementation to blake2b_simd/blake2s_simd
Browse files Browse the repository at this point in the history
This change is primarily a large performance improvement. The BLAKE2b bench_10000
case is improved by about 30%. This implementation also detects SIMD
support at runtime, so the feature flags related to SIMD support are
removed.

The only performance loss is in the bench_10 cases, where the caller
repeatedly feeds input slices shorter than one block. The BLAKE2s
bench_10 case is almost 20% slower. I'm not sure exactly why, but this
implementation optimizes for avoiding copies on long runs of input, so
it might just be that it's doing more math up front. This performance
issue disappears if the inputs are a full block or longer.

The only API consequence of this change is that the undocumented
with_parameter_block constructor is no longer supported. Callers who
need other parameters might prefer to use the blake2b_simd/blake2s_simd
APIs directly, which expose them in a safer way through a Params object.
  • Loading branch information
oconnor663 committed Aug 2, 2019
1 parent 526cc6e commit 4c5dc2f
Show file tree
Hide file tree
Showing 15 changed files with 41 additions and 884 deletions.
7 changes: 3 additions & 4 deletions blake2/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ digest = "0.8"
byte-tools = "0.3"
crypto-mac = "0.7"
opaque-debug = "0.2"
blake2b_simd = { version = "0.5", default-features = false }
blake2s_simd = { version = "0.5", default-features = false }

[dev-dependencies]
digest = { version = "0.8", features = ["dev"] }
Expand All @@ -22,10 +24,7 @@ hex-literal = "0.1"

[features]
default = ["std"]
std = ["digest/std", "crypto-mac/std"]
simd = []
simd_opt = ["simd"]
simd_asm = ["simd_opt"]
std = ["digest/std", "crypto-mac/std", "blake2b_simd/std", "blake2s_simd/std"]

[badges]
travis-ci = { repository = "RustCrypto/hashes" }
43 changes: 0 additions & 43 deletions blake2/src/as_bytes.rs

This file was deleted.

231 changes: 28 additions & 203 deletions blake2/src/blake2.rs
Original file line number Diff line number Diff line change
@@ -1,76 +1,26 @@
macro_rules! blake2_impl {
(
$state:ident, $fix_state:ident, $word:ident, $vec:ident, $bytes:ident,
$R1:expr, $R2:expr, $R3:expr, $R4:expr, $IV:expr,
$vardoc:expr, $doc:expr,
$state:ident, $fix_state:ident, $word:ident, $bytes:ident,
$vardoc:expr, $doc:expr, $lib:ident,
) => {

use $crate::as_bytes::AsBytes;
use $crate::simd::{Vector4, $vec};

use digest::{Input, BlockInput, FixedOutput, VariableOutput, Reset};
use digest::InvalidOutputSize;
use digest::generic_array::GenericArray;
use digest::generic_array::typenum::Unsigned;
use core::cmp;
use byte_tools::{copy, zero};
use byte_tools::copy;
use crypto_mac::{Mac, MacResult, InvalidKeyLength};

use $lib::Params;
use $lib::State;

type Output = GenericArray<u8, $bytes>;

#[derive(Clone)]
#[doc=$vardoc]
pub struct $state {
m: [$word; 16],
h: [$vec; 2],
t: u64,
n: usize,

h0: [$vec; 2],
m0: [$word; 16],
t0: u64,
}

#[inline(always)]
fn iv0() -> $vec { $vec::new($IV[0], $IV[1], $IV[2], $IV[3]) }
#[inline(always)]
fn iv1() -> $vec { $vec::new($IV[4], $IV[5], $IV[6], $IV[7]) }

#[inline(always)]
fn quarter_round(v: &mut [$vec; 4], rd: u32, rb: u32, m: $vec) {
v[0] = v[0].wrapping_add(v[1]).wrapping_add(m.from_le());
v[3] = (v[3] ^ v[0]).rotate_right_const(rd);
v[2] = v[2].wrapping_add(v[3]);
v[1] = (v[1] ^ v[2]).rotate_right_const(rb);
}

#[inline(always)]
fn shuffle(v: &mut [$vec; 4]) {
v[1] = v[1].shuffle_left_1();
v[2] = v[2].shuffle_left_2();
v[3] = v[3].shuffle_left_3();
}

#[inline(always)]
fn unshuffle(v: &mut [$vec; 4]) {
v[1] = v[1].shuffle_right_1();
v[2] = v[2].shuffle_right_2();
v[3] = v[3].shuffle_right_3();
}

#[inline(always)]
fn round(v: &mut [$vec; 4], m: &[$word; 16], s: &[usize; 16]) {
quarter_round(v, $R1, $R2, $vec::gather(m,
s[ 0], s[ 2], s[ 4], s[ 6]));
quarter_round(v, $R3, $R4, $vec::gather(m,
s[ 1], s[ 3], s[ 5], s[ 7]));

shuffle(v);
quarter_round(v, $R1, $R2, $vec::gather(m,
s[ 8], s[10], s[12], s[14]));
quarter_round(v, $R3, $R4, $vec::gather(m,
s[ 9], s[11], s[13], s[15]));
unshuffle(v);
params: Params,
state: State,
output_size: usize,
}

impl $state {
Expand All @@ -80,156 +30,33 @@ macro_rules! blake2_impl {
/// make sure to compare codes in constant time! It can be done
/// for example by using `subtle` crate.
pub fn new_keyed(key: &[u8], output_size: usize) -> Self {
let kk = key.len();
assert!(kk <= $bytes::to_usize());
assert!(output_size <= $bytes::to_usize());

let p0 = 0x0101_0000 ^ ((kk as $word) << 8) ^
(output_size as $word);
let h0 = [iv0() ^ $vec::new(p0, 0, 0, 0), iv1()];
let mut state = $state {
m: [0; 16],
h: h0,
t: 0,
n: output_size,

t0: 0,
m0: [0; 16],
h0: h0,
};

if kk > 0 {
copy(key, state.m.as_mut_bytes());
state.t = 2 * $bytes::to_u64();
}

state.t0 = state.t;
state.m0 = state.m;
state
}

#[doc(hidden)]
pub fn with_parameter_block(p: &[$word; 8]) -> Self {
let nn = p[0] as u8 as usize;
let kk = (p[0] >> 8) as u8 as usize;
assert!(nn >= 1 && nn <= $bytes::to_usize());
assert!(kk <= $bytes::to_usize());

let h0 = [
iv0() ^ $vec::new(p[0], p[1], p[2], p[3]),
iv1() ^ $vec::new(p[4], p[5], p[6], p[7]),
];

$state {
m: [0; 16],
h: h0,
t: 0,
n: nn,

t0: 0,
m0: [0; 16],
h0: h0,
let mut params = Params::new();
params.hash_length(output_size);
params.key(key);
Self {
state: params.to_state(),
params,
output_size,
}
}

/// Updates the hashing context with more data.
fn update(&mut self, data: &[u8]) {
let mut rest = data;

let block = 2 * $bytes::to_usize();

let off = self.t as usize % block;
if off != 0 || self.t == 0 {
let len = cmp::min(block - off, rest.len());

let part = &rest[..len];
rest = &rest[part.len()..];

copy(part, &mut self.m.as_mut_bytes()[off..]);
self.t = self.t.checked_add(part.len() as u64)
.expect("hash data length overflow");
}

while rest.len() >= block {
self.compress(0, 0);

let part = &rest[..block];
rest = &rest[part.len()..];

copy(part, &mut self.m.as_mut_bytes());
self.t = self.t.checked_add(part.len() as u64)
.expect("hash data length overflow");
}

let n = rest.len();
if n > 0 {
self.compress(0, 0);

copy(rest, &mut self.m.as_mut_bytes());
self.t = self.t.checked_add(rest.len() as u64)
.expect("hash data length overflow");
}
self.state.update(data);
}

#[doc(hidden)]
pub fn finalize_last_node(self) -> Output {
self.finalize_with_flag(!0)
self.finalize_with_last_node(true)
}


fn finalize_with_flag(mut self, f1: $word) -> Output {
let off = self.t as usize % (2 * $bytes::to_usize());
if off != 0 {
zero(&mut self.m.as_mut_bytes()[off..]);
}

self.compress(!0, f1);

let buf = [self.h[0].to_le(), self.h[1].to_le()];

fn finalize_with_last_node(mut self, last_node: bool) -> Output {
self.state.set_last_node(last_node);
let hash = self.state.finalize();
let mut out = GenericArray::default();
copy(buf.as_bytes(), &mut out);
copy(hash.as_bytes(), &mut out);
out
}

fn compress(&mut self, f0: $word, f1: $word) {
use $crate::consts::SIGMA;

let m = &self.m;
let h = &mut self.h;

let t0 = self.t as $word;
let t1 = match $bytes::to_u8() {
64 => 0,
32 => (self.t >> 32) as $word,
_ => unreachable!(),
};

let mut v = [
h[0],
h[1],
iv0(),
iv1() ^ $vec::new(t0, t1, f0, f1),
];

round(&mut v, m, &SIGMA[0]);
round(&mut v, m, &SIGMA[1]);
round(&mut v, m, &SIGMA[2]);
round(&mut v, m, &SIGMA[3]);
round(&mut v, m, &SIGMA[4]);
round(&mut v, m, &SIGMA[5]);
round(&mut v, m, &SIGMA[6]);
round(&mut v, m, &SIGMA[7]);
round(&mut v, m, &SIGMA[8]);
round(&mut v, m, &SIGMA[9]);
if $bytes::to_u8() == 64 {
round(&mut v, m, &SIGMA[0]);
round(&mut v, m, &SIGMA[1]);
}

h[0] = h[0] ^ (v[0] ^ v[2]);
h[1] = h[1] ^ (v[1] ^ v[3]);
}
}

impl Default for $state {
Expand All @@ -255,21 +82,19 @@ macro_rules! blake2_impl {
}

fn output_size(&self) -> usize {
self.n
self.output_size
}

fn variable_result<F: FnOnce(&[u8])>(self, f: F) {
let n = self.n;
let res = self.finalize_with_flag(0);
let n = self.output_size;
let res = self.finalize_with_last_node(false);
f(&res[..n]);
}
}

impl Reset for $state {
fn reset(&mut self) {
self.t = self.t0;
self.m = self.m0;
self.h = self.h0;
self.state = self.params.to_state();
}
}

Expand Down Expand Up @@ -304,7 +129,7 @@ macro_rules! blake2_impl {
type OutputSize = $bytes;

fn fixed_result(self) -> Output {
self.state.finalize_with_flag(0)
self.state.finalize_with_last_node(false)
}
}

Expand Down Expand Up @@ -339,7 +164,7 @@ macro_rules! blake2_impl {
}

fn result(self) -> MacResult<Self::OutputSize> {
MacResult::new(self.state.finalize_with_flag(0))
MacResult::new(self.state.finalize_with_last_node(false))
}
}

Expand Down
5 changes: 2 additions & 3 deletions blake2/src/blake2b.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
use digest::generic_array::typenum::U64;
use consts::BLAKE2B_IV;

blake2_impl!(VarBlake2b, Blake2b, u64, u64x4, U64,
32, 24, 16, 63, BLAKE2B_IV,
blake2_impl!(VarBlake2b, Blake2b, u64, U64,
"Blake2b instance with a variable output.",
"Blake2b instance with a fixed output.",
blake2b_simd,
);
5 changes: 2 additions & 3 deletions blake2/src/blake2s.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
use digest::generic_array::typenum::U32;
use consts::BLAKE2S_IV;

blake2_impl!(VarBlake2s, Blake2s, u32, u32x4, U32,
16, 12, 8, 7, BLAKE2S_IV,
blake2_impl!(VarBlake2s, Blake2s, u32, U32,
"Blake2s instance with a variable output.",
"Blake2s instance with a fixed output.",
blake2s_simd,
);
Loading

0 comments on commit 4c5dc2f

Please sign in to comment.